diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 0e0b84f7e3374..a366db1c580ba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -68,13 +68,15 @@ def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets", def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts", "FlatGlobalInsts", "true", - "Have global_* flat memory instructions" + "Have global_* flat memory instructions", + [FeatureFlatAddressSpace] >; def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", "FlatScratchInsts", "true", - "Have scratch_* flat memory instructions" + "Have scratch_* flat memory instructions", + [FeatureFlatAddressSpace] >; def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts", @@ -92,7 +94,8 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", "FlatGVSMode", "true", - "Have GVS addressing mode with flat_* instructions" + "Have GVS addressing mode with flat_* instructions", + [FeatureFlatAddressSpace] >; def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", @@ -934,13 +937,15 @@ def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-glo def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32", "HasAtomicFMinFMaxF32FlatInsts", "true", - "Has flat memory instructions for atomicrmw fmin/fmax for float" + "Has flat memory instructions for atomicrmw fmin/fmax for float", + [FeatureFlatAddressSpace] >; def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64", "HasAtomicFMinFMaxF64FlatInsts", "true", - "Has flat memory instructions for atomicrmw fmin/fmax for double" + "Has flat memory instructions for atomicrmw fmin/fmax for double", + [FeatureFlatAddressSpace] >; def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", @@ -992,7 +997,8 @@ def 
FeatureFlatAtomicFaddF32Inst : SubtargetFeature<"flat-atomic-fadd-f32-inst", "HasFlatAtomicFaddF32Inst", "true", - "Has flat_atomic_add_f32 instruction" + "Has flat_atomic_add_f32 instruction", + [FeatureFlatAddressSpace] >; def FeatureFlatBufferGlobalAtomicFaddF64Inst diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index bec920380e081..f2e432fa8d7f5 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -51,22 +51,6 @@ class DS_Pseudo patt let Uses = !if(has_m0_read, [M0, EXEC], [EXEC]); } -class DstOperandIsAV { - bit ret = OperandIsAV(OperandList, "vdst")>.ret; -} - -class DstOperandIsAGPR { - bit ret = OperandIsAGPR(OperandList, "vdst")>.ret; -} - -class DataOperandIsAV { - bit ret = OperandIsAV(OperandList, "data0")>.ret; -} - -class DataOperandIsAGPR { - bit ret = OperandIsAGPR(OperandList, "data0")>.ret; -} - class DS_Real : InstSI , Enc64 { @@ -115,13 +99,13 @@ class DS_Real : // register fields are only 8-bit, so data operands must all be AGPR // or VGPR. defvar DstOpIsAV = !if(ps.has_vdst, - DstOperandIsAV.ret, 0); + VDstOperandIsAV.ret, 0); defvar DstOpIsAGPR = !if(ps.has_vdst, - DstOperandIsAGPR.ret, 0); + VDstOperandIsAGPR.ret, 0); defvar DataOpIsAV = !if(!or(ps.has_data0, ps.has_gws_data0), - DataOperandIsAV.ret, 0); + Data0OperandIsAV.ret, 0); defvar DataOpIsAGPR = !if(!or(ps.has_data0, ps.has_gws_data0), - DataOperandIsAGPR.ret, 0); + Data0OperandIsAGPR.ret, 0); bits<1> acc = !if(ps.has_vdst, !if(DstOpIsAV, vdst{9}, DstOpIsAGPR), diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 1617f7954a5ee..fd7c9a741c301 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -137,7 +137,18 @@ class FLAT_Real op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : // unsigned for flat accesses. 
bits<13> offset; // GFX90A+ only: instruction uses AccVGPR for data - bits<1> acc = !if(ps.has_vdst, vdst{9}, !if(ps.has_data, vdata{9}, 0)); + defvar DstOpIsAV = !if(ps.has_vdst, + VDstOperandIsAV.ret, 0); + defvar DstOpIsAGPR = !if(ps.has_vdst, + VDstOperandIsAGPR.ret, 0); + defvar DataOpIsAV = !if(ps.has_data, + VDataOperandIsAV.ret, 0); + defvar DataOpIsAGPR = !if(ps.has_data, + VDataOperandIsAGPR.ret, 0); + + bits<1> acc = !if(ps.has_vdst, + !if(DstOpIsAV, vdst{9}, DstOpIsAGPR), + !if(DataOpIsAV, vdata{9}, DataOpIsAGPR)); // We don't use tfe right now, and it was removed in gfx9. bits<1> tfe = 0; @@ -297,7 +308,7 @@ multiclass FLAT_Flat_Store_Pseudo_t16 { multiclass FLAT_Global_Load_Pseudo { - let is_flat_global = 1 in { + let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { def "" : FLAT_Load_Pseudo, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Load_Pseudo, @@ -347,7 +358,7 @@ multiclass FLAT_Global_Load_AddTid_Pseudo { - let is_flat_global = 1 in { + let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { def "" : FLAT_Store_Pseudo, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Store_Pseudo, @@ -860,6 +871,30 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN< let enabled_saddr = 1; let FPAtomic = data_vt.isFP; } + + defvar vdst_op_agpr = getEquivalentAGPROperand.ret; + defvar data_op_agpr = getEquivalentAGPROperand.ret; + + let SubtargetPredicate = isGFX90APlus in { + def _RTN_agpr : FLAT_AtomicRet_Pseudo , + GlobalSaddrTable<0, opName#"_rtn_agpr"> { + let has_saddr = 1; + let FPAtomic = data_vt.isFP; + } + + def _SADDR_RTN_agpr : FLAT_AtomicRet_Pseudo , + GlobalSaddrTable<1, opName#"_rtn_agpr"> { + let has_saddr = 1; + let enabled_saddr = 1; + let FPAtomic = data_vt.isFP; + } + } } } @@ -1043,8 +1078,12 @@ let SubtargetPredicate = HasFlatAtomicFaddF32Inst in { let SubtargetPredicate = isGFX12Plus in { defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPROp_32, i32>; - defm FLAT_ATOMIC_COND_SUB_U32 : 
FLAT_Atomic_Pseudo <"flat_atomic_cond_sub_u32", VGPROp_32, i32>; -} // End SubtargetPredicate = isGFX12Plus + defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>; +} + +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in { + defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_NO_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>; +} defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte">; defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte">; @@ -1296,19 +1335,19 @@ let SubtargetPredicate = isGFX10Plus in { FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", AVLdSt_64, f64, v2f64, AVLdSt_128>; } // End SubtargetPredicate = isGFX10Plus -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_add_f32", AVLdSt_32, f32 >; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_pk_add_f16", AVLdSt_32, v2f16 >; -let OtherPredicates = [HasAtomicFaddRtnInsts] in +let SubtargetPredicate = HasAtomicFaddRtnInsts in defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN < "global_atomic_add_f32", AVLdSt_32, f32 >; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN < "global_atomic_pk_add_f16", AVLdSt_32, v2f16 >; @@ -1442,8 +1481,10 @@ class FlatStoreSaddrPat : GCNPat < (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)), - (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset, $cpol) ->; + (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset, $cpol)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let 
OtherPredicates = inst.OtherPredicates; +} class GlobalAtomicNoRtnSaddrPat : GCNPat < @@ -1469,19 +1510,24 @@ class FlatStoreSignedAtomicPat .ret:$data, $offset) >; -multiclass FlatAtomicNoRtnPatBase { - + defvar inst = !cast(base_inst_name); + defvar inst_saddr = !cast(inst#"_SADDR"); defvar noRtnNode = !cast(node); let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; + } - def : FlatAtomicSaddrPat(inst#"_SADDR"), !cast(node), + def : FlatAtomicSaddrPat(node), GlobalSAddr, vt, data_vt> { let AddedComplexity = 9; - let SubtargetPredicate = HasFlatGVSMode; + let SubtargetPredicate = inst_saddr.SubtargetPredicate; + let OtherPredicates = inst_saddr.OtherPredicates; } } @@ -1494,17 +1540,22 @@ multiclass FlatAtomicNoRtnPat ; -multiclass FlatAtomicRtnPatBase { - + defvar inst = !cast(inst_name#"_RTN"); + defvar inst_saddr = !cast(inst_name#"_SADDR_RTN"); defvar rtnNode = !cast(node); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (!cast(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; + } - def : FlatAtomicSaddrPat(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt> { + def : FlatAtomicSaddrPat { let AddedComplexity = 8; - let SubtargetPredicate = HasFlatGVSMode; + let SubtargetPredicate = inst_saddr.SubtargetPredicate; + let OtherPredicates = inst_saddr.OtherPredicates; } } @@ -1540,8 +1591,10 @@ multiclass FlatAtomicIntrPat : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) ->; + 
(inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; +} multiclass FlatSignedAtomicPat { multiclass GlobalFLATLoadPats { def : FlatLoadSignedPat { let AddedComplexity = 10; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } def : FlatLoadSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } } multiclass GlobalFLATLoadPats_M0 { def : FlatLoadSignedPat_M0 { let AddedComplexity = 10; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } def : GlobalLoadSaddrPat_M0(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } } multiclass GlobalFLATLoadPats_CPOL { def : FlatLoadSignedPat_CPOL { let AddedComplexity = 10; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } def : GlobalLoadSaddrPat_CPOL(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } } @@ -1701,10 +1766,14 @@ multiclass GlobalFLATLoadPats_D16_t16 { def : FlatStoreSignedPat { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; let AddedComplexity = 10; } def : FlatStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; let AddedComplexity = 11; } } @@ -1849,7 +1918,9 @@ multiclass ScratchFLATLoadPats_D16_t16 { - def : FlatLoadPat ; + def : FlatLoadPat { + let OtherPredicates = [HasFlatAddressSpace]; + } def : FlatLoadSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 9; @@ -1876,7 +1947,9 @@ 
multiclass FlatLoadPats_D16_t16 { - def : FlatStorePat ; + def : FlatStorePat { + let OtherPredicates = [HasFlatAddressSpace]; + } def : FlatStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 9; @@ -1893,8 +1966,6 @@ multiclass FlatStorePats_t16; defm : FlatLoadPats ; defm : FlatLoadPats ; @@ -2018,12 +2089,7 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; defm : FlatStorePats ; defm : FlatStorePats ; -} // End OtherPredicates = [HasFlatAddressSpace] - -let OtherPredicates = [isGFX12Plus] in defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; - -let OtherPredicates = [isGFX12Plus, HasAtomicCSubNoRtnInsts] in defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; let OtherPredicates = [HasD16LoadStore] in { @@ -2048,8 +2114,6 @@ defm : FlatLoadPats_D16 ; defm : FlatLoadPats_D16 ; } -let OtherPredicates = [HasFlatGlobalInsts] in { - defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; @@ -2063,7 +2127,7 @@ defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in { +let True16Predicate = p in { defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; @@ -2077,7 +2141,7 @@ defm : GlobalFLATLoadPats ; } -let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { +let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>; @@ -2174,7 +2238,7 @@ defm : GlobalFLATAtomicPats 
<"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_glo defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; -let OtherPredicates = [HasAtomicCSubNoRtnInsts] in +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; @@ -2194,7 +2258,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i let SubtargetPredicate = isGFX12Plus in { defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; - let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; } @@ -2249,62 +2313,38 @@ let OtherPredicates = [isGFX1250Plus] in { defm : GlobalStoreLDSPats ; } -let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; -} - -let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in { defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>; defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>; -} -let OtherPredicates = [isGFX12Only] in { - // FIXME: Remove these intrinsics - defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>; - defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>; - defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", 
"int_amdgcn_flat_atomic_fmin_num", f32>; - defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; +// FIXME: Remove these intrinsics +let SubtargetPredicate = isGFX12Only in { +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; } -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -} -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; -} -let OtherPredicates = [HasAtomicFaddRtnInsts] in { defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -} -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; -} -let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; -} -let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; -} -let OtherPredicates = [HasFlatAtomicFaddF32Inst] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; -} - -let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; defm 
: FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>; -} -let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; -} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { @@ -2632,8 +2672,10 @@ multiclass FLAT_Global_Real_Atomics_vi op, FLAT_Real_AllAddr_vi { def _RTN_vi : FLAT_Real_vi (NAME#"_RTN"), has_sccb>; def _SADDR_RTN_vi : FLAT_Real_vi (NAME#"_SADDR_RTN"), has_sccb>; -} + def _RTN_agpr_vi : FLAT_Real_vi (NAME#"_RTN_agpr"), has_sccb>; + def _SADDR_RTN_agpr_vi : FLAT_Real_vi (NAME#"_SADDR_RTN_agpr"), has_sccb>; +} defm FLAT_ATOMIC_SWAP : FLAT_Real_Atomics_vi <0x40>; defm FLAT_ATOMIC_CMPSWAP : FLAT_Real_Atomics_vi <0x41>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 8c2bd3d3962ce..d9746a17e75eb 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1492,3 +1492,27 @@ class OperandIsVGPR { defvar reg_class = getRegClassFromOp.ret; bit ret = !and(reg_class.HasVGPR, !not(reg_class.HasAGPR)); } + +class VDstOperandIsAV { + bit ret = OperandIsAV(OperandList, "vdst")>.ret; +} + +class VDstOperandIsAGPR { + bit ret = OperandIsAGPR(OperandList, "vdst")>.ret; +} + +class Data0OperandIsAV { + bit ret = OperandIsAV(OperandList, "data0")>.ret; +} + +class Data0OperandIsAGPR { + bit ret = OperandIsAGPR(OperandList, "data0")>.ret; +} + +class VDataOperandIsAV { + bit ret = OperandIsAV(OperandList, "vdata")>.ret; +} + +class VDataOperandIsAGPR { + bit ret = OperandIsAGPR(OperandList, "vdata")>.ret; +} diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll new file mode 100644 index 0000000000000..6b6eb43baf856 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll @@ -0,0 +1,21350 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX950 %s + +;--------------------------------------------------------------------- +; xchg i32 cases +;--------------------------------------------------------------------- + +; Input and result use AGPR +define void @flat_atomic_xchg_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + call void asm "; use 
$0", "a"(i32 %result) + ret void +} + +; Input is AGPR, result used as VGPR. +define void @flat_atomic_xchg_i32_ret_a_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + call void asm "; use $0", "v"(i32 %result) + ret void +} + +; Input is VGPR, result used as AGPR +define void @flat_atomic_xchg_i32_ret_v_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; 
GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=v"() + %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is AV, result also used as AV +define void @flat_atomic_xchg_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +; Input is AV, used as v +define void @flat_atomic_xchg_i32_ret_av_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + call void asm "; use $0", "v"(i32 %result) + ret void +} + +; Input is AV, used as a +define void @flat_atomic_xchg_i32_ret_av_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], 
v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is a, result used as AV +define void @flat_atomic_xchg_i32_ret_a_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; 
GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +; Input is v, result used as AV +define void @flat_atomic_xchg_i32_ret_v_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=v"() + %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_xchg_i32_ret_av_av_no_agprs(ptr %ptr) #0 { +; GFX90A-LABEL: 
flat_atomic_xchg_i32_ret_av_av_no_agprs: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v8, off, 
s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX90A-NEXT: 
v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; 
GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_ret_av_av_no_agprs: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: 
buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX950-NEXT: 
v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"() + %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0 + %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1 + %result = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1) + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_xchg_i32_noret_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v[0:1], a0 offset:40 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v[0:1], a0 offset:40 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %unused = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + ret void +} + +define void @flat_atomic_xchg_i32_noret_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap v[0:1], v2 offset:40 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap v[0:1], v2 offset:40 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %unused = atomicrmw xchg ptr %gep.0, i32 %data seq_cst + ret void +} + +;--------------------------------------------------------------------- 
+; xchg i64 cases +;--------------------------------------------------------------------- + +; Input and result use AGPR +define void @flat_atomic_xchg_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB11_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB11_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB11_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB11_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB11_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $agpr2_agpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB11_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB11_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx2 v0, a[2:3], off +; GFX950-NEXT: .LBB11_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = 
atomicrmw xchg ptr %gep.0, i64 %data seq_cst + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is AGPR, result used as VGPR. +define void @flat_atomic_xchg_i64_ret_a_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB12_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB12_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB12_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB12_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB12_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB12_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB12_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off +; GFX950-NEXT: .LBB12_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst + call void asm "; use $0", "v"(i64 %result) + ret void +} + +; Input is VGPR, result used as AGPR +define 
void @flat_atomic_xchg_i64_ret_v_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB13_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB13_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB13_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB13_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB13_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB13_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB13_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off +; GFX950-NEXT: .LBB13_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=v"() + %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is AV, result also used as AV +define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB14_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB14_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB14_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB14_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: 
;;#ASMSTART +; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB14_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB14_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB14_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off +; GFX950-NEXT: .LBB14_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +; Input is AV, used as v +define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB15_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB15_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB15_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB15_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB15_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc0 sc1 
+; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB15_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB15_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off +; GFX950-NEXT: .LBB15_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst + call void asm "; use $0", "v"(i64 %result) + ret void +} + +; Input is AV, used as a +define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB16_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB16_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB16_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword a0, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword a1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB16_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB16_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; 
GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB16_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB16_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off +; GFX950-NEXT: .LBB16_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is a, result used as AV +define void @flat_atomic_xchg_i64_ret_a_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB17_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; 
GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB17_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB17_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB17_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB17_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB17_2: ; %Flow +; GFX950-NEXT: 
s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB17_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx2 v2, a[0:1], off +; GFX950-NEXT: .LBB17_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +; Input is v, result used as AV +define void @flat_atomic_xchg_i64_ret_v_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB18_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB18_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GFX90A-NEXT: s_cbranch_execz .LBB18_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB18_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB18_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB18_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB18_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-NEXT: 
s_nop 0 +; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off +; GFX950-NEXT: .LBB18_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=v"() + %result = atomicrmw xchg ptr %gep.0, i64 %data seq_cst + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_xchg_i64_noret_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB19_4 +; GFX90A-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], a[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB19_2 +; GFX90A-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: 
buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB19_3 +; GFX950-NEXT: ; %bb.1: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB19_4 +; GFX950-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX950-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], a[0:1] sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB19_2 +; GFX950-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v0, a[0:1], off +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %unused = atomicrmw xchg ptr %ptr, i64 %data seq_cst + ret void +} + +define void @flat_atomic_xchg_i64_noret_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB20_4 +; GFX90A-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB20_2 +; GFX90A-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB20_3 +; GFX950-NEXT: ; %bb.1: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB20_4 +; GFX950-NEXT: .LBB20_2: ; 
%atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX950-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB20_2 +; GFX950-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %unused = atomicrmw xchg ptr %ptr, i64 %data seq_cst + ret void +} + +;--------------------------------------------------------------------- +; xor i32 cases with cmpxchg expansion +;--------------------------------------------------------------------- + +; Input and result use AGPR +define void @flat_atomic_xor_expansion_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB21_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i32 %data seq_cst + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is AGPR, result used as VGPR. 
+define void @flat_atomic_xor_expansion_i32_ret_a_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB22_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i32 %data seq_cst + call void asm "; use $0", "v"(i32 %result) + ret void +} + +; Input is VGPR, result used as AGPR +define void @flat_atomic_xor_expansion_i32_ret_v_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], 0 
+; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB23_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=v"() + %result = atomicrmw xor ptr %ptr, i32 %data seq_cst + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is AV, result also used as AV +define void @flat_atomic_xor_expansion_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; 
GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB24_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i32 %data seq_cst + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +; Input is AV, used as v +define void @flat_atomic_xor_expansion_i32_ret_av_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def 
v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB25_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm 
"; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i32 %data seq_cst + call void asm "; use $0", "v"(i32 %result) + ret void +} + +; Input is AV, used as a +define void @flat_atomic_xor_expansion_i32_ret_av_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB26_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i32 %data seq_cst + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is a, result used as AV +define void @flat_atomic_xor_expansion_i32_ret_a_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
flat_atomic_xor_expansion_i32_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB27_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i32 %data seq_cst + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +; Input is v, result used as AV +define void @flat_atomic_xor_expansion_i32_ret_v_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB28_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=v"() + %result = atomicrmw xor ptr %ptr, i32 %data seq_cst + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: v_accvgpr_write_b32 a33, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; 
GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a33 +; GFX90A-NEXT: flat_load_dword v1, v[4:5] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v1, v[4:5], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v1 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a32 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 
; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i32_ret_av_av_no_agprs: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:68 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:64 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:60 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:56 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:52 ; 4-byte Folded Spill +; 
GFX950-NEXT: scratch_store_dword off, v45, s32 offset:48 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:44 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:40 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:36 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v58, s32 offset:28 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:24 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:20 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:16 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:12 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:8 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:4 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a33, s32 ; 4-byte Folded Spill +; GFX950-NEXT: v_accvgpr_write_b32 a33, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a32, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX950-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX950-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX950-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX950-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX950-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX950-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX950-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX950-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX950-NEXT: v_accvgpr_write_b32 a16, 
v16 +; GFX950-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX950-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX950-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX950-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX950-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX950-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX950-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX950-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX950-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX950-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX950-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a33 +; GFX950-NEXT: flat_load_dword v1, v[4:5] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v0 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v1, v[4:5], v[2:3] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB29_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a32, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX950-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX950-NEXT: 
v_accvgpr_read_b32 v9, a9 +; GFX950-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX950-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX950-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX950-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX950-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX950-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX950-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX950-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX950-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX950-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX950-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX950-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX950-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX950-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX950-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX950-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX950-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX950-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX950-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX950-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX950-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a32 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_load_dword a33, off, s32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:4 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:8 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:12 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:16 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:20 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:24 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:28 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:36 ; 4-byte Folded Reload +; GFX950-NEXT: 
scratch_load_dword v47, off, s32 offset:40 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:44 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:52 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:60 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:68 ; 4-byte Folded Reload +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"() + %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0 + %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1 + %result = atomicrmw xor ptr %ptr, i32 %data seq_cst + call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1) + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_xor_expansion_i32_noret_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i32_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB30_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %unused = atomicrmw xor ptr %ptr, i32 %data seq_cst + ret void +} + +define void @flat_atomic_xor_expansion_i32_noret_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i32_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
.LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i32_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB31_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %unused = atomicrmw xor ptr %ptr, i32 %data seq_cst + ret void +} + +;--------------------------------------------------------------------- +; xor i64 cases with cmpxchg expansion 
+;--------------------------------------------------------------------- + +; Input and result use AGPR +define void @flat_atomic_xor_expansion_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB32_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB32_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB32_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB32_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB32_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB32_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB32_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB32_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; 
GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB32_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: .LBB32_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB32_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB32_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i64 %data seq_cst + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is AGPR, result used as VGPR. 
+define void @flat_atomic_xor_expansion_i64_ret_a_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB33_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB33_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB33_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB33_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB33_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, 
v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB33_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB33_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB33_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB33_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; 
implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB33_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB33_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB33_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i64 %data seq_cst + call void asm "; use $0", "v"(i64 %result) + ret void +} + +; Input is VGPR, result used as AGPR +define void @flat_atomic_xor_expansion_i64_ret_v_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB34_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB34_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: 
v_xor_b32_e32 v2, v4, v6 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB34_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB34_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB34_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB34_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; 
GFX950-NEXT: ; def v[6:7] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB34_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB34_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB34_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: .LBB34_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB34_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB34_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; 
use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=v"() + %result = atomicrmw xor ptr %ptr, i64 %data seq_cst + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is AV, result also used as AV +define void @flat_atomic_xor_expansion_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB35_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB35_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB35_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB35_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB35_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB35_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB35_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB35_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 
s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB35_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB35_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB35_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB35_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i64 %data seq_cst + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +; Input is AV, result used as v +define void @flat_atomic_xor_expansion_i64_ret_av_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB36_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +;
GFX90A-NEXT: .LBB36_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB36_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB36_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB36_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB36_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; 
GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB36_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB36_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB36_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB36_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB36_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB36_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds 
[512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i64 %data seq_cst + call void asm "; use $0", "v"(i64 %result) + ret void +} + +; Input is AV, result used as a +define void @flat_atomic_xor_expansion_i64_ret_av_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB37_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB37_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB37_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB37_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB37_6 +; GFX90A-NEXT: ; %bb.5:
; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v7 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB37_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[6:7] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB37_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB37_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: 
s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB37_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: .LBB37_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB37_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB37_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i64 %data seq_cst + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is a, result used as AV +define void @flat_atomic_xor_expansion_i64_ret_a_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; 
GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB38_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB38_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB38_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB38_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB38_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB38_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB38_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB38_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB38_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB38_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB38_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX950-NEXT: 
v_xor_b32_e32 v0, v4, v2 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB38_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i64 %data seq_cst + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +; Input is v, result used as AV +define void @flat_atomic_xor_expansion_i64_ret_v_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i64_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB39_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB39_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; 
GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB39_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB39_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB39_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i64_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB39_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB39_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: 
flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB39_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB39_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB39_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v0, v4, v2 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB39_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=v"() + %result = atomicrmw xor ptr %ptr, i64 %data seq_cst + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_xor_expansion_i64_noret_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i64_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; 
GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB40_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB40_6 +; GFX90A-NEXT: .LBB40_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB40_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB40_4: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB40_4 +; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB40_2 +; GFX90A-NEXT: .LBB40_6: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v7 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v6 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i64_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB40_3 +; GFX950-NEXT: ; %bb.1: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB40_6 +; GFX950-NEXT: .LBB40_2: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX950-NEXT: .LBB40_3: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB40_4: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB40_4 +; GFX950-NEXT: ; %bb.5: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB40_2 +; GFX950-NEXT: .LBB40_6: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; 
GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v1, v7 +; GFX950-NEXT: v_xor_b32_e32 v0, v0, v6 +; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %unused = atomicrmw xor ptr %ptr, i64 %data seq_cst + ret void +} + +define void @flat_atomic_xor_expansion_i64_noret_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_expansion_i64_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB41_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB41_6 +; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB41_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB41_4: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: s_or_b64 
s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB41_4 +; GFX90A-NEXT: ; %bb.5: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB41_2 +; GFX90A-NEXT: .LBB41_6: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v7 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v2, v6 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_expansion_i64_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[6:7] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB41_3 +; GFX950-NEXT: ; %bb.1: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB41_6 +; GFX950-NEXT: .LBB41_2: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX950-NEXT: .LBB41_3: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB41_4: ; %atomicrmw.start +; GFX950-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB41_4 +; GFX950-NEXT: ; %bb.5: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB41_2 +; GFX950-NEXT: .LBB41_6: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v1, v7 +; GFX950-NEXT: v_xor_b32_e32 v0, v0, v6 +; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %unused = atomicrmw xor ptr %ptr, i64 %data seq_cst + ret void +} + +;--------------------------------------------------------------------- +; xor i32 cases with instruction +;--------------------------------------------------------------------- + +; Input and result use AGPR +define void @flat_atomic_xor_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is AGPR, result used as VGPR.
+define void @flat_atomic_xor_i32_ret_a_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "v"(i32 %result) + ret void +} + +; Input is VGPR, result used as AGPR +define void @flat_atomic_xor_i32_ret_v_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
flat_atomic_xor_i32_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=v"() + %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is AV, result also used as AV +define void @flat_atomic_xor_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") 
seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +; Input is AV, used as v +define void @flat_atomic_xor_i32_ret_av_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "v"(i32 %result) + ret void +} + +; Input is AV, used as a +define void @flat_atomic_xor_i32_ret_av_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX950-LABEL: flat_atomic_xor_i32_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is a, result used as AV +define void @flat_atomic_xor_i32_ret_a_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 
10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +; Input is v, result used as AV +define void @flat_atomic_xor_i32_ret_v_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=v"() + %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_xor_i32_ret_av_av_no_agprs(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload 
Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte 
Folded Spill +; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; 
GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload 
Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_ret_av_av_no_agprs: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; 
Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, 
v20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; 
GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"() + %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0 + %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1 + %result = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1) + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_xor_i32_noret_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_xor v[0:1], a0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor v[0:1], a0 +; 
GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %unused = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @flat_atomic_xor_i32_noret_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_xor v[0:1], v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor v[0:1], v2 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %unused = atomicrmw xor ptr %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +;--------------------------------------------------------------------- +; xor i64 cases with instruction +;--------------------------------------------------------------------- + +; Input and result use AGPR +define void @flat_atomic_xor_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: 
v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB53_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB53_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB53_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB53_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, 
s1, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB53_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB53_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB53_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB53_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is AGPR, result used as VGPR. 
+define void @flat_atomic_xor_i64_ret_a_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB54_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB54_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB54_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB54_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] 
+; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB54_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB54_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB54_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB54_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "v"(i64 %result) + ret void +} + +; Input is VGPR, result used as AGPR +define void @flat_atomic_xor_i64_ret_v_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], 
src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB55_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB55_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB55_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB55_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; 
GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB55_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB55_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB55_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB55_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=v"() + %result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is AV, result also used as AV +define void @flat_atomic_xor_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: 
v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB56_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB56_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB56_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB56_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB56_2 +; GFX950-NEXT: ; %bb.1: ; 
%atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB56_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB56_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB56_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +; Input is AV, used as v +define void @flat_atomic_xor_i64_ret_av_v(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB57_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB57_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB57_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB57_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB57_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB57_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz 
.LBB57_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB57_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "v"(i64 %result) + ret void +} + +; Input is AV, used as a +define void @flat_atomic_xor_i64_ret_av_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB58_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB58_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB58_4 +; GFX90A-NEXT: 
; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB58_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB58_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB58_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB58_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: 
v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB58_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is a, result used as AV +define void @flat_atomic_xor_i64_ret_a_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB59_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB59_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB59_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB59_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB59_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB59_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB59_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; 
GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB59_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +; Input is v, result used as AV +define void @flat_atomic_xor_i64_ret_v_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB60_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB60_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB60_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v4, v2, v4 +; GFX90A-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB60_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB60_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB60_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB60_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v5 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v4 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB60_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = 
getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=v"() + %result = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_xor_i64_noret_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB61_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB61_4 +; GFX90A-NEXT: .LBB61_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB61_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], a[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB61_2 +; GFX90A-NEXT: .LBB61_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 
offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB61_3 +; GFX950-NEXT: ; %bb.1: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB61_4 +; GFX950-NEXT: .LBB61_2: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX950-NEXT: .LBB61_3: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], a[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB61_2 +; GFX950-NEXT: .LBB61_4: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX950-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %unused = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, 
!amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @flat_atomic_xor_i64_noret_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB62_3 +; GFX90A-NEXT: ; %bb.1: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB62_4 +; GFX90A-NEXT: .LBB62_2: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; GFX90A-NEXT: .LBB62_3: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB62_2 +; GFX90A-NEXT: .LBB62_4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base 
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB62_3 +; GFX950-NEXT: ; %bb.1: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB62_4 +; GFX950-NEXT: .LBB62_2: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] +; GFX950-NEXT: .LBB62_3: ; %atomicrmw.global +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB62_2 +; GFX950-NEXT: .LBB62_4: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX950-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %unused = atomicrmw xor ptr %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +;--------------------------------------------------------------------- +; other atomics i32, with aa+av cases +;--------------------------------------------------------------------- + +define void @flat_atomic_add_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_add_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_add_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_add_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_add_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_add_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_sub_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_sub_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_sub_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_sub_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_sub_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_sub_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_and_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_and_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_and_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_and_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_and_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_and_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_nand_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_nand_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB69_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, 
i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_nand_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_nand_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2 +; GFX90A-NEXT: v_not_b32_e32 v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_nand_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB70_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_or_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_or_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_or_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + 
call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_or_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_or_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_or_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_max_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_max_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_max_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; 
def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_max_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_max_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_max_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_min_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_min_i32_ret_a_a: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_min_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_min_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_min_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_min_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_smin v0, 
v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_umax_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umax_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umax_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_umax_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umax_i32_ret_av_av: +; GFX90A: ; 
%bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umax_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_umin_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umin_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umin_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0 +; 
GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_umin_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umin_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umin_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_uinc_wrap_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_uinc_wrap_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_uinc_wrap_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_uinc_wrap_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_udec_wrap_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_udec_wrap_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_udec_wrap_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_udec_wrap_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_udec_wrap_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_udec_wrap_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_usub_cond_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_cond_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 
exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_cond_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB85_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_usub_cond_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_cond_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_cond_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v3 +; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB86_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_sat_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_sat_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB87_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_usub_sat_i32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_sat_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_sat_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB88_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics i64, with aa+av cases +;--------------------------------------------------------------------- + +define void @flat_atomic_add_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_add_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: 
s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB89_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB89_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB89_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB89_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_add_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 
src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB89_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB89_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB89_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB89_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_add_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_add_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 
+; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB90_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB90_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB90_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v0, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB90_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_add_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 
+; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB90_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB90_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB90_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB90_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_sub_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_sub_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; 
GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB91_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB91_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB91_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB91_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_sub_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; 
implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB91_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB91_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB91_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB91_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_sub_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_sub_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], 
src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB92_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB92_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB92_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB92_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_sub_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, 
s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB92_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB92_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB92_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB92_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_and_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_and_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB93_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB93_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB93_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB93_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_and_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: 
s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB93_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB93_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB93_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB93_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_and_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_and_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB94_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB94_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB94_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB94_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_and_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB94_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: 
flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB94_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB94_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB94_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_nand_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_nand_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB95_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global 
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB95_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, v3, v7 +; GFX90A-NEXT: v_and_b32_e32 v8, v2, v6 +; GFX90A-NEXT: v_not_b32_e32 v1, v0 +; GFX90A-NEXT: v_not_b32_e32 v0, v8 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB95_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB95_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB95_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_and_b32_e32 v3, v1, v7 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v4, v2, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_not_b32_e32 v2, v3 +; GFX90A-NEXT: v_not_b32_e32 v3, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB95_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_nand_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB95_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB95_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v8, v2, v6 +; GFX950-NEXT: v_not_b32_e32 v1, v0 +; GFX950-NEXT: v_not_b32_e32 v0, v8 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB95_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: .LBB95_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB95_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; 
GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, v1, v7 +; GFX950-NEXT: v_and_b32_e32 v5, v0, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_not_b32_e32 v3, v2 +; GFX950-NEXT: v_not_b32_e32 v2, v5 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB95_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_nand_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_nand_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB96_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[2:3] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB96_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: 
v_and_b32_e32 v4, v7, v1 +; GFX90A-NEXT: v_and_b32_e32 v8, v6, v0 +; GFX90A-NEXT: v_not_b32_e32 v5, v4 +; GFX90A-NEXT: v_not_b32_e32 v4, v8 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[2:3], v[4:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB96_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB96_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB96_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0 +; GFX90A-NEXT: v_not_b32_e32 v0, v0 +; GFX90A-NEXT: v_not_b32_e32 v1, v1 +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB96_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_nand_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: 
;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB96_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB96_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_and_b32_e32 v2, v9, v1 +; GFX950-NEXT: v_and_b32_e32 v3, v8, v0 +; GFX950-NEXT: v_not_b32_e32 v7, v2 +; GFX950-NEXT: v_not_b32_e32 v6, v3 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB96_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB96_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB96_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX950-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_not_b32_e32 v1, v1 +; GFX950-NEXT: v_not_b32_e32 v0, v0 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: .LBB96_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds 
[512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_or_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_or_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB97_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB97_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB97_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 
offen +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB97_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_or_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB97_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB97_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB97_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB97_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_or_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_or_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB98_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB98_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB98_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX90A-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB98_4: ; %atomicrmw.phi 
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_or_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB98_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB98_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB98_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB98_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 
+ call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_max_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_max_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB99_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB99_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB99_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB99_4: ; %atomicrmw.phi +; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_max_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB99_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB99_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB99_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: .LBB99_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: 
;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_max_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_max_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB100_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB100_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB100_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, 
s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB100_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_max_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB100_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB100_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB100_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB100_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr 
%ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_min_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_min_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB101_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB101_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB101_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, 
v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB101_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_min_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB101_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB101_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB101_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: 
v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: .LBB101_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_min_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_min_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB102_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB102_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB102_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: 
v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB102_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_min_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB102_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB102_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB102_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB102_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: 
s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_umax_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umax_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB103_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB103_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB103_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB103_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umax_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB103_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB103_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB103_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, 
v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: .LBB103_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_umax_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umax_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB104_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB104_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB104_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: 
buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB104_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umax_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB104_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB104_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB104_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc 
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB104_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_umin_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umin_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB105_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB105_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB105_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: 
buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB105_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umin_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB105_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB105_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB105_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, 
v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: .LBB105_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_umin_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umin_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB106_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB106_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GFX90A-NEXT: s_cbranch_execz .LBB106_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB106_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umin_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB106_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB106_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB106_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: 
scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB106_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_uinc_wrap_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB107_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB107_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GFX90A-NEXT: s_cbranch_execz .LBB107_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc +; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB107_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB107_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB107_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB107_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB107_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_uinc_wrap_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: 
s_cbranch_execz .LBB108_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB108_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB108_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB108_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_uinc_wrap_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB108_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], 
v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB108_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB108_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB108_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_udec_wrap_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_udec_wrap_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], 
vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB109_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB109_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB109_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB109_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: 
;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB109_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB109_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB109_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 +; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB109_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 
+ call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_udec_wrap_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_udec_wrap_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB110_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB110_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB110_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] +; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB110_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_udec_wrap_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB110_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB110_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB110_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB110_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data 
= call i64 asm "; def $0", "=^VA"() + %result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_usub_cond_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB111_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB111_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB111_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 
exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB111_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB111_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB111_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB111_4 +; GFX950-NEXT: ; %bb.1: ; 
%atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB111_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB111_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB111_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB111_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB111_6: ; 
%atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_usub_cond_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_cond_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB112_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[2:3] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB112_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v1, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[2:3], v[4:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: 
s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB112_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB112_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB112_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v4, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v1, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB112_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_cond_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB112_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: 
flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB112_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB112_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB112_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB112_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v5, vcc, v2, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v1, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: .LBB112_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: 
s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_usub_sat_i64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB113_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB113_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; 
GFX90A-NEXT: s_cbranch_execnz .LBB113_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB113_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB113_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB113_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 
s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB113_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB113_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v7, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB113_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: .LBB113_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB113_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v7, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; 
GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB113_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_usub_sat_i64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_sat_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB114_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB114_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], 
v[6:7] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB114_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB114_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB114_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v3, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB114_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_sat_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB114_4 +; 
GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB114_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB114_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB114_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB114_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-NEXT: .LBB114_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND 
+; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics f32, with aa+av cases +;--------------------------------------------------------------------- + +define void @flat_atomic_fadd_f32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB115_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB115_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc +; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB115_3: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB115_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private +; 
GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: .LBB115_5: ; %Flow1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: .LBB115_6: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB115_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: .LBB115_8: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + 
+define void @flat_atomic_fadd_f32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB116_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr2 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB116_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX90A-NEXT: global_atomic_add_f32 v2, v[0:1], v3, off glc +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr3 +; GFX90A-NEXT: .LBB116_3: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB116_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f32_e32 v1, v2, v3 +; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX90A-NEXT: .LBB116_5: ; %Flow1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr3 +; GFX90A-NEXT: .LBB116_6: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB116_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, 
v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ds_add_rtn_f32 v2, v0, v3 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB116_8: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @flat_atomic_fsub_f32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, 
v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB117_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fsub_f32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB118_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB118_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr 
%gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @flat_atomic_fmax_f32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX950-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: 
v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB119_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fmax_f32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB120_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB120_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @flat_atomic_fmin_f32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .LBB121_1: ; %atomicrmw.start +; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX950-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB121_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, 
i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fmin_f32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB122_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX950-NEXT: 
v_min_f32_e32 v4, v2, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB122_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB123_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fmaximum_f32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB124_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB124_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum 
ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB125_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fminimum_f32_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB126_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end 
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB126_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics f64, with aa+av cases +;--------------------------------------------------------------------- + +define void @flat_atomic_fadd_f64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB127_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB127_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB127_3: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB127_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB127_5: ; %Flow1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; 
GFX90A-NEXT: .LBB127_6: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB127_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: .LBB127_8: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB127_6 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX950-NEXT: s_cbranch_execz .LBB127_3 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GFX950-NEXT: .LBB127_3: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX950-NEXT: s_cbranch_execz .LBB127_5 +; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB127_5: ; %Flow1 +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB127_6: ; %Flow2 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB127_8 +; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: .LBB127_8: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fadd_f64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 
0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB128_6 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v3 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB128_3 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off glc +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB128_3: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX90A-NEXT: s_cbranch_execz .LBB128_5 +; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB128_5: ; %Flow1 +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB128_6: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB128_8 +; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB128_8: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_shared_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB128_6 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s3, v3 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX950-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX950-NEXT: s_cbranch_execz .LBB128_3 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[2:3], v[4:5], off sc0 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB128_3: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX950-NEXT: s_cbranch_execz .LBB128_5 +; GFX950-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: 
.LBB128_5: ; %Flow1 +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB128_6: ; %Flow2 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB128_8 +; GFX950-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc +; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: .LBB128_8: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB129_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: 
.LBB129_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB129_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: .LBB129_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB129_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB129_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; 
GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB129_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB129_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[6:7] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB129_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: .LBB129_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB129_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[6:7] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-NEXT: .LBB129_6: ; %atomicrmw.phi +; GFX950-NEXT: 
s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB130_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB130_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB130_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; 
GFX90A-NEXT: .LBB130_4: ; %Flow3 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB130_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB130_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB130_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB130_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 
s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB130_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB130_4: ; %Flow3 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB130_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[4:5], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_add_f64 v[0:1], v[4:5], -v[2:3] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB130_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @flat_atomic_fmax_f64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB131_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB131_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB131_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB131_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: 
s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB131_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB131_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB131_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB131_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fmax_f64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], 
src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB132_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB132_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB132_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB132_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 
s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB132_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_max_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB132_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB132_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB132_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @flat_atomic_fmin_f64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: 
v_cmp_ne_u32_e32 vcc, s5, v1 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB133_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB133_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB133_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB133_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: 
v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB133_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB133_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB133_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB133_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fmin_f64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB134_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3], v[4:5] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB134_2: ; %Flow +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB134_4 +; GFX90A-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB134_4: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 
+; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB134_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_atomic_min_f64 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB134_2: ; %Flow +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB134_4 +; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB134_4: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB135_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX90A-NEXT: .LBB135_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB135_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB135_4: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB135_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; 
GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB135_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB135_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: .LBB135_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB135_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB135_4: ; %Flow2 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB135_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB135_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB136_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: .LBB136_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB136_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB136_4: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB136_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB136_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB136_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: .LBB136_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX950-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB136_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB136_4: ; %Flow2 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB136_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB136_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; 
GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v7 +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB137_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX90A-NEXT: .LBB137_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB137_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: .LBB137_4: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB137_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_min_f64 v[2:3], 
v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB137_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v7 +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB137_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: .LBB137_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB137_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: .LBB137_4: ; %Flow2 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB137_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-NEXT: .LBB137_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x50, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, 
v1, vcc +; GFX90A-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB138_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: .LBB138_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB138_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB138_4: ; %Flow2 +; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX90A-NEXT: s_cbranch_execz .LBB138_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, 
v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB138_6: ; %atomicrmw.phi +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b64 s[0:1], 0x50 +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB138_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: .LBB138_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX950-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB138_2 +; GFX950-NEXT: ; %bb.3: ; 
%Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB138_4: ; %Flow2 +; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-NEXT: s_cbranch_execz .LBB138_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v6, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-NEXT: .LBB138_6: ; %atomicrmw.phi +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics v2f16, with aa+av cases +;--------------------------------------------------------------------- + +define void @flat_atomic_fadd_v2f16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: 
.LBB139_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB139_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fadd_v2f16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB140_1: ; 
%atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB140_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB141_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB141_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, 
!amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fsub_v2f16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB142_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] 
+; GFX950-NEXT: s_cbranch_execnz .LBB142_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB143_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: 
;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX950-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB143_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmax_v2f16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX90A-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3 +; GFX90A-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB144_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB144_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: 
flat_atomic_fmin_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB145_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX950-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; 
GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB145_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmin_v2f16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX90A-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB146_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_min_f16 v4, v2, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB146_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa 
vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB147_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB147_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + 
%gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmaximum_v2f16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB148_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] 
offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB148_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: .LBB149_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_min_f16 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4 +; GFX90A-NEXT: 
v_cndmask_b32_e64 v6, v5, v2, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB149_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB149_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB149_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; 
def $0", "=a"() + %result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fminimum_v2f16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB150_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v3, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB150_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics v2bf16, with aa+av cases +;--------------------------------------------------------------------- + +define void @flat_atomic_fadd_v2bf16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB151_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB151_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, 
!amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fadd_v2bf16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB152_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB152_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX950-LABEL: flat_atomic_fadd_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB153_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, 
v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB153_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX950-NEXT: .LBB153_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB153_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; 
GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fsub_v2bf16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB154_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: 
s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB154_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: .LBB154_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB154_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) 
#0 { +; GFX90A-LABEL: flat_atomic_fmax_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB155_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB155_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX950-NEXT: .LBB155_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB155_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmax_v2bf16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: 
v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB156_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB156_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: .LBB156_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB156_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB157_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: 
v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB157_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX950-NEXT: .LBB157_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB157_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmin_v2bf16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB158_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; 
GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB158_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: .LBB158_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB158_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = 
getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB159_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v8, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB159_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX950-NEXT: .LBB159_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_maximum3_f32 v2, v2, v4, v4 +; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB159_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + 
%data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmaximum_v2bf16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB160_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB160_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: .LBB160_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2 +; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB160_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> 
%result) + ret void +} + +define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB161_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v8, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; 
GFX90A-NEXT: s_cbranch_execnz .LBB161_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX950-NEXT: .LBB161_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_minimum3_f32 v2, v2, v4, v4 +; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB161_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void 
@flat_atomic_fminimum_v2bf16_ret_av_av(ptr %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB162_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2 +; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB162_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, 
s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: .LBB162_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2 +; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB162_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics i32, with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void 
@flat_atomic_xchg_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xchg ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_xchg_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xchg ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_add_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_add_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_add_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_add v0, 
v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_add_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_add_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_add_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_add v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw add ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_sub_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: 
flat_atomic_sub_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_sub_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_sub_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_sub_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_sub_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw sub ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_and_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_and_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_and_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_and_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_and_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_and_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_and v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw and ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_nand_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB171_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB171_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_nand_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB171_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, 
exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB171_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_nand_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB172_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2 +; GFX90A-NEXT: v_not_b32_e32 v4, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB172_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_nand_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: 
s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB172_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw nand ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_or_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_or_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_or_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_or_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_or_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_or_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_or v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw or ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, 
!amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_xor_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_xor_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_max_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_max_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_max_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; 
GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_max_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_max_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_max_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_smax v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw max ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + 
+define void @flat_atomic_min_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_min_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_min_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_min_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_min_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_min_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_smin v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw min ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_umax_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umax_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umax_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_umax v0, 
v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_umax_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umax_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umax_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_umax v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw umax ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_umin_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: 
flat_atomic_umin_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umin_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_umin_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umin_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umin_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_umin v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw umin ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_uinc_wrap_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_uinc_wrap_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_uinc_wrap_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw uinc_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_udec_wrap_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: 
flat_atomic_udec_wrap_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_udec_wrap_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, s16 +; GFX90A-NEXT: v_mov_b32_e32 v1, s17 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_udec_wrap_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw udec_wrap ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_usub_cond_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_cond_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB189_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, 
s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB189_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_cond_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB189_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB189_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_usub_cond_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_cond_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB190_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB190_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_cond_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB190_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v3 +; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB190_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw usub_cond ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_sat_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB191_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB191_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_sat_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB191_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB191_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_sat_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB192_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp +; GFX90A-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB192_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_sat_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB192_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw usub_sat ptr %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics i64, with aa+av cases using saddr 
+;--------------------------------------------------------------------- + +define void @flat_atomic_xchg_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB193_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB193_3 +; GFX90A-NEXT: s_branch .LBB193_4 +; GFX90A-NEXT: .LBB193_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB193_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword a0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword a1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB193_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB193_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX950-NEXT: s_cbranch_execz .LBB193_3 +; GFX950-NEXT: s_branch .LBB193_4 +; GFX950-NEXT: .LBB193_2: +; GFX950-NEXT: ; implicit-def: $agpr2_agpr3 +; GFX950-NEXT: .LBB193_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 a[2:3], off, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx2 off, a[0:1], s0 +; GFX950-NEXT: .LBB193_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xchg ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_xchg_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xchg_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: 
s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB194_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB194_3 +; GFX90A-NEXT: s_branch .LBB194_4 +; GFX90A-NEXT: .LBB194_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB194_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB194_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xchg_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB194_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] sc0 +; 
GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB194_3 +; GFX950-NEXT: s_branch .LBB194_4 +; GFX950-NEXT: .LBB194_2: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB194_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB194_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xchg ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_add_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_add_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB195_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB195_3 +; GFX90A-NEXT: s_branch 
.LBB195_4 +; GFX90A-NEXT: .LBB195_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB195_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB195_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_add_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB195_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB195_3 +; GFX950-NEXT: s_branch .LBB195_4 +; GFX950-NEXT: .LBB195_2: +; GFX950-NEXT: ; implicit-def: 
$agpr0_agpr1 +; GFX950-NEXT: .LBB195_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB195_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_add_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_add_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB196_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB196_3 +; GFX90A-NEXT: s_branch .LBB196_4 +; GFX90A-NEXT: .LBB196_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB196_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: 
v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB196_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_add_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB196_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_add_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB196_3 +; GFX950-NEXT: s_branch .LBB196_4 +; GFX950-NEXT: .LBB196_2: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB196_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB196_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = 
getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw add ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_sub_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_sub_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB197_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB197_3 +; GFX90A-NEXT: s_branch .LBB197_4 +; GFX90A-NEXT: .LBB197_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB197_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX90A-NEXT: 
buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB197_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_sub_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB197_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB197_3 +; GFX950-NEXT: s_branch .LBB197_4 +; GFX950-NEXT: .LBB197_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB197_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB197_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt 
vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_sub_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_sub_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB198_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_sub_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB198_3 +; GFX90A-NEXT: s_branch .LBB198_4 +; GFX90A-NEXT: .LBB198_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB198_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB198_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_sub_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB198_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB198_3 +; GFX950-NEXT: s_branch .LBB198_4 +; GFX950-NEXT: .LBB198_2: +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB198_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB198_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw sub ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_and_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_and_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, 
s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB199_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB199_3 +; GFX90A-NEXT: s_branch .LBB199_4 +; GFX90A-NEXT: .LBB199_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB199_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, v4, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB199_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_and_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 
s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB199_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB199_3 +; GFX950-NEXT: s_branch .LBB199_4 +; GFX950-NEXT: .LBB199_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB199_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX950-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB199_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_and_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_and_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: 
s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB200_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_and_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB200_3 +; GFX90A-NEXT: s_branch .LBB200_4 +; GFX90A-NEXT: .LBB200_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB200_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB200_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_and_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB200_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 
v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB200_3 +; GFX950-NEXT: s_branch .LBB200_4 +; GFX950-NEXT: .LBB200_2: +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB200_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB200_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw and ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_nand_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_nand_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_cbranch_vccz .LBB201_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB201_2: ; 
%atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_and_b32_e32 v8, v2, v4 +; GFX90A-NEXT: v_not_b32_e32 v1, v0 +; GFX90A-NEXT: v_not_b32_e32 v0, v8 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB201_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB201_6 +; GFX90A-NEXT: .LBB201_4: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_cbranch_execz .LBB201_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_and_b32_e32 v3, v1, v5 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v4, v2, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_not_b32_e32 v2, v3 +; GFX90A-NEXT: v_not_b32_e32 v3, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB201_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_nand_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: s_cbranch_vccz .LBB201_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB201_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, v3, v5 +; GFX950-NEXT: v_and_b32_e32 v8, v2, v4 +; GFX950-NEXT: v_not_b32_e32 v1, v0 +; GFX950-NEXT: v_not_b32_e32 v0, v8 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB201_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB201_6 +; GFX950-NEXT: .LBB201_4: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_cbranch_execz .LBB201_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v4, v0, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_not_b32_e32 v3, v2 +; GFX950-NEXT: 
v_not_b32_e32 v2, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB201_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_nand_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_nand_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB202_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB202_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_and_b32_e32 v2, v9, v1 +; GFX90A-NEXT: v_and_b32_e32 v3, v8, v0 +; GFX90A-NEXT: v_not_b32_e32 v7, v2 +; GFX90A-NEXT: v_not_b32_e32 v6, v3 +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, 
s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB202_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB202_6 +; GFX90A-NEXT: .LBB202_4: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_cbranch_execz .LBB202_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX90A-NEXT: v_not_b32_e32 v0, v0 +; GFX90A-NEXT: v_not_b32_e32 v1, v1 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB202_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_nand_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB202_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB202_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; 
GFX950-NEXT: v_and_b32_e32 v2, v9, v1 +; GFX950-NEXT: v_and_b32_e32 v3, v8, v0 +; GFX950-NEXT: v_not_b32_e32 v7, v2 +; GFX950-NEXT: v_not_b32_e32 v6, v3 +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB202_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB202_6 +; GFX950-NEXT: .LBB202_4: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_cbranch_execz .LBB202_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX950-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_not_b32_e32 v1, v1 +; GFX950-NEXT: v_not_b32_e32 v0, v0 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB202_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw nand ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_or_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_or_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: 
; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB203_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB203_3 +; GFX90A-NEXT: s_branch .LBB203_4 +; GFX90A-NEXT: .LBB203_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB203_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB203_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_or_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], 
-1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB203_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB203_3 +; GFX950-NEXT: s_branch .LBB203_4 +; GFX950-NEXT: .LBB203_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB203_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX950-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB203_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_or_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_or_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, 
exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB204_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_or_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB204_3 +; GFX90A-NEXT: s_branch .LBB204_4 +; GFX90A-NEXT: .LBB204_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB204_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB204_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_or_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB204_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB204_3 +; GFX950-NEXT: s_branch .LBB204_4 
+; GFX950-NEXT: .LBB204_2: +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB204_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB204_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw or ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_xor_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB205_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB205_3 +; GFX90A-NEXT: s_branch .LBB205_4 +; GFX90A-NEXT: 
.LBB205_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB205_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v0, v4, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB205_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB205_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB205_3 +; GFX950-NEXT: s_branch .LBB205_4 +; GFX950-NEXT: .LBB205_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB205_3: ; 
%atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX950-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB205_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_xor_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_xor_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB206_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_xor_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB206_3 +; GFX90A-NEXT: s_branch .LBB206_4 +; GFX90A-NEXT: .LBB206_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB206_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 
+; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB206_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_xor_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB206_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB206_3 +; GFX950-NEXT: s_branch .LBB206_4 +; GFX950-NEXT: .LBB206_2: +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB206_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB206_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr 
inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_max_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_max_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB207_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB207_3 +; GFX90A-NEXT: s_branch .LBB207_4 +; GFX90A-NEXT: .LBB207_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB207_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, 
v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB207_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_max_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB207_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB207_3 +; GFX950-NEXT: s_branch .LBB207_4 +; GFX950-NEXT: .LBB207_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB207_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB207_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; 
GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_max_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_max_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB208_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_smax_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB208_3 +; GFX90A-NEXT: s_branch .LBB208_4 +; GFX90A-NEXT: .LBB208_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB208_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB208_4: ; %atomicrmw.end 
+; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_max_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB208_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB208_3 +; GFX950-NEXT: s_branch .LBB208_4 +; GFX950-NEXT: .LBB208_2: +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB208_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB208_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw max ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_min_i64_saddr_ret_a_a(ptr inreg 
%ptr) #0 { +; GFX90A-LABEL: flat_atomic_min_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB209_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB209_3 +; GFX90A-NEXT: s_branch .LBB209_4 +; GFX90A-NEXT: .LBB209_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB209_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB209_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX950-LABEL: flat_atomic_min_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB209_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB209_3 +; GFX950-NEXT: s_branch .LBB209_4 +; GFX950-NEXT: .LBB209_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB209_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB209_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, 
!amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_min_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_min_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB210_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_smin_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB210_3 +; GFX90A-NEXT: s_branch .LBB210_4 +; GFX90A-NEXT: .LBB210_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB210_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_le_i64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB210_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_min_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 
s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB210_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB210_3 +; GFX950-NEXT: s_branch .LBB210_4 +; GFX950-NEXT: .LBB210_2: +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB210_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB210_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw min ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_umax_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umax_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB211_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB211_3 +; GFX90A-NEXT: s_branch .LBB211_4 +; GFX90A-NEXT: .LBB211_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB211_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB211_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umax_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; 
GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB211_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB211_3 +; GFX950-NEXT: s_branch .LBB211_4 +; GFX950-NEXT: .LBB211_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB211_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB211_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_umax_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umax_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB212_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_umax_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB212_3 +; GFX90A-NEXT: s_branch .LBB212_4 +; GFX90A-NEXT: .LBB212_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB212_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB212_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umax_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; 
GFX950-NEXT: s_cbranch_vccz .LBB212_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB212_3 +; GFX950-NEXT: s_branch .LBB212_4 +; GFX950-NEXT: .LBB212_2: +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB212_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB212_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw umax ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_umin_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umin_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB213_2 +; GFX90A-NEXT: ; 
%bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB213_3 +; GFX90A-NEXT: s_branch .LBB213_4 +; GFX90A-NEXT: .LBB213_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB213_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB213_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umin_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB213_2 +; GFX950-NEXT: ; 
%bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB213_3 +; GFX950-NEXT: s_branch .LBB213_4 +; GFX950-NEXT: .LBB213_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB213_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB213_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_umin_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_umin_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: 
;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB214_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_umin_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB214_3 +; GFX90A-NEXT: s_branch .LBB214_4 +; GFX90A-NEXT: .LBB214_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB214_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_le_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB214_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_umin_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB214_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB214_3 +; GFX950-NEXT: s_branch .LBB214_4 +; 
GFX950-NEXT: .LBB214_2: +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB214_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB214_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw umin ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB215_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: 
v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB215_3 +; GFX90A-NEXT: s_branch .LBB215_4 +; GFX90A-NEXT: .LBB215_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB215_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v3, vcc +; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB215_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB215_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB215_3 +; GFX950-NEXT: s_branch .LBB215_4 +; GFX950-NEXT: .LBB215_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB215_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, 1 +; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB215_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_uinc_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB216_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB216_3 +; GFX90A-NEXT: s_branch .LBB216_4 +; GFX90A-NEXT: .LBB216_2: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB216_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc +; GFX90A-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB216_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_uinc_wrap_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB216_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB216_3 +; GFX950-NEXT: s_branch .LBB216_4 +; GFX950-NEXT: .LBB216_2: +; 
GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB216_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB216_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw uinc_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_udec_wrap_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB217_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; 
GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB217_3 +; GFX90A-NEXT: s_branch .LBB217_4 +; GFX90A-NEXT: .LBB217_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB217_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: .LBB217_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB217_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] 
+; GFX950-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB217_3 +; GFX950-NEXT: s_branch .LBB217_4 +; GFX950-NEXT: .LBB217_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB217_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[0:1] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 0, -1 +; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s2 +; GFX950-NEXT: .LBB217_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_udec_wrap_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB218_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB218_3 +; GFX90A-NEXT: s_branch .LBB218_4 +; GFX90A-NEXT: .LBB218_2: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: .LBB218_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, -1, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v3, vcc +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX90A-NEXT: v_cmp_gt_u64_e64 s[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB218_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_udec_wrap_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB218_2 +; 
GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB218_3 +; GFX950-NEXT: s_branch .LBB218_4 +; GFX950-NEXT: .LBB218_2: +; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-NEXT: .LBB218_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s2 +; GFX950-NEXT: .LBB218_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw udec_wrap ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_usub_cond_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_cond_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; 
GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_cbranch_vccz .LBB219_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB219_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB219_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB219_6 +; GFX90A-NEXT: .LBB219_4: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_cbranch_execz .LBB219_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, v0, v4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v1, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 
a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v1, v6, vcc +; GFX90A-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB219_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_cond_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: s_cbranch_vccz .LBB219_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB219_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; 
GFX950-NEXT: s_cbranch_execnz .LBB219_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB219_6 +; GFX950-NEXT: .LBB219_4: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_cbranch_execz .LBB219_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB219_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_usub_cond_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_cond_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB220_4 +; 
GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB220_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB220_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB220_6 +; GFX90A-NEXT: .LBB220_4: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_cbranch_execz .LBB220_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, v2, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v1, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB220_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_cond_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB220_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB220_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB220_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB220_6 +; GFX950-NEXT: .LBB220_4: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_cbranch_execz .LBB220_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: 
scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v1, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB220_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw usub_cond ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @flat_atomic_usub_sat_i64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_sat_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_cbranch_vccz .LBB221_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB221_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; 
GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB221_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB221_6 +; GFX90A-NEXT: .LBB221_4: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_cbranch_execz .LBB221_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: .LBB221_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_sat_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; 
GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: s_cbranch_vccz .LBB221_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB221_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB221_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB221_6 +; GFX950-NEXT: .LBB221_4: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_cbranch_execz .LBB221_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt 
vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB221_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @flat_atomic_usub_sat_i64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_usub_sat_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB222_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB222_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 
v3, vcc, v9, v1, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB222_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB222_6 +; GFX90A-NEXT: .LBB222_4: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_cbranch_execz .LBB222_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB222_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_usub_sat_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: 
s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB222_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB222_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB222_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB222_6 +; GFX950-NEXT: .LBB222_4: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_cbranch_execz .LBB222_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB222_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; 
GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw usub_sat ptr %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics f32, with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void @flat_atomic_fadd_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 40 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_cbranch_vccz .LBB223_3 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_vccz .LBB223_4 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX90A-NEXT: s_cbranch_execz .LBB223_5 +; GFX90A-NEXT: s_branch .LBB223_6 +; GFX90A-NEXT: .LBB223_3: +; GFX90A-NEXT: ; implicit-def: $agpr0 +; GFX90A-NEXT: s_branch .LBB223_7 +; GFX90A-NEXT: .LBB223_4: +; GFX90A-NEXT: ; implicit-def: $agpr0 +; GFX90A-NEXT: .LBB223_5: ; 
%atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v1, s6 +; GFX90A-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX90A-NEXT: .LBB223_6: ; %Flow1 +; GFX90A-NEXT: s_cbranch_execnz .LBB223_8 +; GFX90A-NEXT: .LBB223_7: ; %atomicrmw.shared +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: ds_add_rtn_f32 v0, v1, v0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: .LBB223_8: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fadd_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: 
flat_atomic_fadd_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 40 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB224_3 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_vccz .LBB224_4 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v0, s[4:5] glc +; GFX90A-NEXT: s_cbranch_execz .LBB224_5 +; GFX90A-NEXT: s_branch .LBB224_6 +; GFX90A-NEXT: .LBB224_3: +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: s_branch .LBB224_7 +; GFX90A-NEXT: .LBB224_4: +; GFX90A-NEXT: ; implicit-def: $vgpr1 +; GFX90A-NEXT: .LBB224_5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s6 +; GFX90A-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f32_e32 v3, v1, v0 +; GFX90A-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX90A-NEXT: .LBB224_6: ; %Flow1 +; GFX90A-NEXT: s_cbranch_execnz .LBB224_8 +; GFX90A-NEXT: .LBB224_7: ; %atomicrmw.shared +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: ds_add_rtn_f32 v1, v1, v0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB224_8: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v1 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @flat_atomic_fsub_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB225_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB225_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB225_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fsub_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB226_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = 
atomicrmw fsub ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @flat_atomic_fmax_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: 
v_max_f32_e32 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB227_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fmax_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB228_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX90A-NEXT: 
v_max_f32_e32 v4, v2, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB228_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB228_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB228_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm 
"; use $0", "^VA"(float %result) + ret void +} + +define void @flat_atomic_fmin_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB229_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fmin_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB230_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB230_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB230_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB230_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: 
flat_atomic_fmaximum_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB231_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB231_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB232_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB232_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB232_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB233_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB233_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB234_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB234_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART 
+; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB234_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics f64, with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void @flat_atomic_fadd_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB235_3 +; GFX90A-NEXT: ; %bb.1: ; 
%atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_vccz .LBB235_4 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB235_5 +; GFX90A-NEXT: s_branch .LBB235_6 +; GFX90A-NEXT: .LBB235_3: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_branch .LBB235_7 +; GFX90A-NEXT: .LBB235_4: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB235_5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB235_6: ; %Flow1 +; GFX90A-NEXT: s_cbranch_execnz .LBB235_8 +; GFX90A-NEXT: .LBB235_7: ; %atomicrmw.shared +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: .LBB235_8: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB235_3 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: s_cbranch_vccz .LBB235_4 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB235_5 +; GFX950-NEXT: s_branch .LBB235_6 +; GFX950-NEXT: .LBB235_3: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_branch .LBB235_7 +; GFX950-NEXT: .LBB235_4: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB235_5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2 +; GFX950-NEXT: .LBB235_6: ; %Flow1 +; GFX950-NEXT: s_cbranch_execnz .LBB235_8 +; GFX950-NEXT: .LBB235_7: ; %atomicrmw.shared +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 
s0, s0, -1 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: ds_add_rtn_f64 v[0:1], v2, v[0:1] +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: .LBB235_8: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fadd_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_shared_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB236_3 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_vccz .LBB236_4 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[4:5] glc +; GFX90A-NEXT: s_cbranch_execz .LBB236_5 +; GFX90A-NEXT: s_branch .LBB236_6 +; GFX90A-NEXT: .LBB236_3: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_branch .LBB236_7 +; GFX90A-NEXT: .LBB236_4: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: 
.LBB236_5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s6, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s6 +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB236_6: ; %Flow1 +; GFX90A-NEXT: s_cbranch_execnz .LBB236_8 +; GFX90A-NEXT: .LBB236_7: ; %atomicrmw.shared +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, s4 +; GFX90A-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB236_8: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB236_3 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: s_cbranch_vccz .LBB236_4 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 +; GFX950-NEXT: 
s_cbranch_execz .LBB236_5 +; GFX950-NEXT: s_branch .LBB236_6 +; GFX950-NEXT: .LBB236_3: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_branch .LBB236_7 +; GFX950-NEXT: .LBB236_4: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB236_5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX950-NEXT: scratch_store_dwordx2 off, v[4:5], s2 +; GFX950-NEXT: .LBB236_6: ; %Flow1 +; GFX950-NEXT: s_cbranch_execnz .LBB236_8 +; GFX950-NEXT: .LBB236_7: ; %atomicrmw.shared +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: .LBB236_8: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, 
s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_cbranch_vccz .LBB237_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB237_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB237_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB237_6 +; GFX90A-NEXT: .LBB237_4: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_cbranch_execz .LBB237_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB237_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
flat_atomic_fsub_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: s_cbranch_vccz .LBB237_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB237_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB237_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB237_6 +; GFX950-NEXT: .LBB237_4: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_cbranch_execz .LBB237_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_add_f64 v[2:3], v[0:1], -v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], 
s0 +; GFX950-NEXT: .LBB237_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB238_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: .LBB238_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB238_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB238_6 +; GFX90A-NEXT: .LBB238_4: +; GFX90A-NEXT: ; 
implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_cbranch_execz .LBB238_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: buffer_load_dword v2, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] +; GFX90A-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB238_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB238_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: .LBB238_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[6:9] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB238_2 +; 
GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB238_6 +; GFX950-NEXT: .LBB238_4: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_cbranch_execz .LBB238_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB238_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @flat_atomic_fmax_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB239_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; 
GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB239_3 +; GFX90A-NEXT: s_branch .LBB239_4 +; GFX90A-NEXT: .LBB239_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB239_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB239_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB239_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: 
v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB239_3 +; GFX950-NEXT: s_branch .LBB239_4 +; GFX950-NEXT: .LBB239_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB239_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB239_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fmax_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB240_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: 
s_cbranch_execz .LBB240_3 +; GFX90A-NEXT: s_branch .LBB240_4 +; GFX90A-NEXT: .LBB240_2: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB240_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB240_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB240_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB240_3 +; GFX950-NEXT: s_branch .LBB240_4 +; GFX950-NEXT: .LBB240_2: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB240_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: v_max_f64 v[0:1], 
v[0:1], v[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB240_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @flat_atomic_fmin_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: s_cbranch_vccz .LBB241_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_cbranch_execz .LBB241_3 +; GFX90A-NEXT: s_branch .LBB241_4 +; GFX90A-NEXT: .LBB241_2: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: .LBB241_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: 
v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX90A-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB241_4: ; %atomicrmw.end +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: s_cbranch_vccz .LBB241_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_cbranch_execz .LBB241_3 +; GFX950-NEXT: s_branch .LBB241_4 +; GFX950-NEXT: .LBB241_2: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: .LBB241_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 
v[2:3], off, s0 +; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB241_4: ; %atomicrmw.end +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fmin_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB242_2 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cbranch_execz .LBB242_3 +; GFX90A-NEXT: s_branch .LBB242_4 +; GFX90A-NEXT: .LBB242_2: +; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX90A-NEXT: .LBB242_3: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 
offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[2:3] +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB242_4: ; %atomicrmw.end +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB242_2 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_cbranch_execz .LBB242_3 +; GFX950-NEXT: s_branch .LBB242_4 +; GFX950-NEXT: .LBB242_2: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: .LBB242_3: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB242_4: ; %atomicrmw.end +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: 
;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_cbranch_vccz .LBB243_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX90A-NEXT: .LBB243_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 
exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB243_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB243_6 +; GFX90A-NEXT: .LBB243_4: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_cbranch_execz .LBB243_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB243_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: s_cbranch_vccz .LBB243_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; 
GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: .LBB243_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB243_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB243_6 +; GFX950-NEXT: .LBB243_4: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_cbranch_execz .LBB243_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB243_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], 
ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB244_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: .LBB244_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB244_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB244_6 +; GFX90A-NEXT: .LBB244_4: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; 
GFX90A-NEXT: s_cbranch_execz .LBB244_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB244_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB244_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: .LBB244_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX950-NEXT: v_max_f64 v[2:3], v[10:11], v[0:1] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: s_nop 1 +; 
GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB244_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB244_6 +; GFX950-NEXT: .LBB244_4: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_cbranch_execz .LBB244_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[4:5], v[2:3], v[0:1] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB244_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 
src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_cbranch_vccz .LBB245_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX90A-NEXT: .LBB245_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB245_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB245_6 +; GFX90A-NEXT: .LBB245_4: +; GFX90A-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX90A-NEXT: s_cbranch_execz .LBB245_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v0, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v1, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc +; GFX90A-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB245_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: s_cbranch_vccz .LBB245_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: .LBB245_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB245_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB245_6 +; GFX950-NEXT: .LBB245_4: +; GFX950-NEXT: ; implicit-def: $agpr0_agpr1 +; GFX950-NEXT: s_cbranch_execz .LBB245_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[0:1], off, s0 +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[2:3], s0 +; GFX950-NEXT: .LBB245_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_add_u32 s4, s16, 0x50 +; GFX90A-NEXT: s_addc_u32 s5, s17, 0 +; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base +; GFX90A-NEXT: s_cmp_eq_u32 s5, s7 +; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0 
+; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_cbranch_vccz .LBB246_4 +; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: .LBB246_2: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB246_2 +; GFX90A-NEXT: ; %bb.3: ; %Flow +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_branch .LBB246_6 +; GFX90A-NEXT: .LBB246_4: +; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX90A-NEXT: s_cbranch_execz .LBB246_6 +; GFX90A-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX90A-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX90A-NEXT: s_cselect_b32 s4, s4, -1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s4 +; GFX90A-NEXT: buffer_load_dword v2, v6, s[0:3], 0 offen +; GFX90A-NEXT: buffer_load_dword v3, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; GFX90A-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GFX90A-NEXT: 
buffer_store_dword v0, v6, s[0:3], 0 offen offset:4 +; GFX90A-NEXT: .LBB246_6: ; %atomicrmw.phi +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_cbranch_vccz .LBB246_4 +; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: .LBB246_2: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[2:3] +; GFX950-NEXT: v_min_f64 v[2:3], v[10:11], v[0:1] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v3, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX950-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[4:5], v[8:11] sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[10:11] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB246_2 +; GFX950-NEXT: ; %bb.3: ; %Flow +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_branch .LBB246_6 +; GFX950-NEXT: .LBB246_4: +; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-NEXT: s_cbranch_execz .LBB246_6 +; GFX950-NEXT: ; %bb.5: ; %atomicrmw.private +; GFX950-NEXT: 
s_cmp_lg_u64 s[0:1], 0 +; GFX950-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f64 v[4:5], v[2:3], v[0:1] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: .LBB246_6: ; %atomicrmw.phi +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics v2f16, with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void @flat_atomic_fadd_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB247_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 
vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB247_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fadd_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB248_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; 
GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB248_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB249_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB249_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB249_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB249_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fsub ptr %gep.0, <2 x half> 
%data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB250_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB250_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; 
GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB250_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB251_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB251_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB251_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmax_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 
v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB252_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB252_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB252_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] 
+; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB252_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB253_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB253_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 
s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB253_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmin_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB254_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3 +; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB254_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB254_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_min_f16 v4, v2, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB254_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB255_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: 
s_cbranch_execnz .LBB255_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB255_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB255_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; 
GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB256_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB256_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB256_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB257_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_min_f16 v0, v1, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v0, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v5, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v0, v0, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB257_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v3, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB257_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB257_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg 
%ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB258_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB258_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v2, v[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v3, v3 +; GFX950-NEXT: flat_atomic_cmpswap v2, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB258_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics v2bf16, with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void @flat_atomic_fadd_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB259_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB259_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", 
"=a"() + %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fadd_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB260_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; 
GFX90A-NEXT: s_cbranch_execnz .LBB260_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB261_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 
16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB261_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: .LBB261_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: 
v_sub_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB261_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fsub_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB262_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: 
v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB262_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fsub_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB262_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], 
v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB262_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB263_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; 
GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB263_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: .LBB263_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; 
GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB263_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmax_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB264_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; 
GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB264_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB264_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB264_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, 
s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB265_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; 
GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB265_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: .LBB265_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB265_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: 
;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmin_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB266_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 glc +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB266_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB266_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB266_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data 
syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB267_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v8, v0, v4 +; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB267_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: .LBB267_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_maximum3_f32 v0, v0, v4, v4 +; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB267_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 
= getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB268_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, 
v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB268_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmaximum_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB268_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2 +; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB268_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 
s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB269_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v8, v0, v4 +; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v0, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; 
GFX90A-NEXT: v_cndmask_b32_e64 v0, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v0, v7, v0, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB269_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v1, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: .LBB269_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_minimum3_f32 v0, v0, v4, v4 +; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz 
.LBB269_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @flat_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 { +; GFX90A-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[16:17], s[16:17] op_sel:[0,1] +; GFX90A-NEXT: .LBB270_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2 +; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_or_b32_e32 v11, 
0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 +; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB270_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fminimum_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: flat_load_dword v4, v[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: .LBB270_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2 +; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; 
GFX950-NEXT: s_cbranch_execnz .LBB270_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" } + +!0 = !{} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll index 7c36642ce7a3b..c98fff96d7b8a 100644 --- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX90A %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefixes=CHECK,GFX950 %s ;--------------------------------------------------------------------- ; xchg i32 cases @@ -7,375 +8,626 @@ ; Input and result use AGPR define void @global_atomic_xchg_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_ret_a_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc 
-; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=a"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data seq_cst call void asm "; use $0", "a"(i32 %result) ret void } ; Input is AGPR, result used as VGPR. 
define void @global_atomic_xchg_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_ret_a_v: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=a"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data seq_cst call void 
asm "; use $0", "v"(i32 %result) ret void } ; Input is VGPR, result used as AGPR define void @global_atomic_xchg_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_ret_v_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=v"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst + %result = 
atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data seq_cst call void asm "; use $0", "a"(i32 %result) ret void } ; Input is AV, result also used as AV define void @global_atomic_xchg_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_ret_av_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data 
seq_cst call void asm "; use $0", "^VA"(i32 %result) ret void } ; Input is AV, used as v define void @global_atomic_xchg_i32_ret_av_v(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_ret_av_v: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data seq_cst call void asm "; use $0", "v"(i32 %result) ret void } ; Input 
is AV, used as a define void @global_atomic_xchg_i32_ret_av_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_ret_av_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data seq_cst call 
void asm "; use $0", "a"(i32 %result) ret void } ; Input is a, result used as AV define void @global_atomic_xchg_i32_ret_a_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_ret_a_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=a"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data 
seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data seq_cst call void asm "; use $0", "^VA"(i32 %result) ret void } ; Input is v, result used as AV define void @global_atomic_xchg_i32_ret_v_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_ret_v_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=v"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i32 
%data seq_cst call void asm "; use $0", "^VA"(i32 %result) ret void } define void @global_atomic_xchg_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[0:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte 
Folded Spill -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_atomic_swap v0, v[0:1], v2, off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a24, v26 ; 
Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; CHECK-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse 
-; CHECK-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse -; CHECK-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill 
+; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, 
v30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX90A-NEXT: 
v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_ret_av_av_no_agprs: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; 
GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: global_atomic_swap v0, v[0:1], v2, off offset:40 
sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: 
;;#ASMSTART +; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"() %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0 %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1 - %result = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data seq_cst call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1) call void asm "; use $0", "^VA"(i32 %result) ret void } define void @global_atomic_xchg_i32_noret_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_noret_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: 
global_atomic_swap v[0:1], a0, off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap v[0:1], a0, off offset:40 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v[0:1], a0, off offset:40 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=a"() - %unused = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst + %unused = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data seq_cst ret void } define void @global_atomic_xchg_i32_noret_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i32_noret_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap v[0:1], v2, off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i32_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; 
GFX90A-NEXT: global_atomic_swap v[0:1], v2, off offset:40 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap v[0:1], v2, off offset:40 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() - %unused = atomicrmw xchg ptr addrspace(1) %ptr, i32 %data seq_cst + %unused = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data seq_cst ret void } @@ -385,221 +637,368 @@ define void @global_atomic_xchg_i32_noret_av(ptr addrspace(1) %ptr) #0 { ; Input and result use AGPR define void @global_atomic_xchg_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i64_ret_a_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 
v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i64 %data seq_cst call void asm "; use $0", "a"(i64 %result) ret void } ; Input is AGPR, result used as VGPR. 
define void @global_atomic_xchg_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i64_ret_a_v: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i64_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data 
= call i64 asm "; def $0", "=a"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i64 %data seq_cst call void asm "; use $0", "v"(i64 %result) ret void } ; Input is VGPR, result used as AGPR define void @global_atomic_xchg_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i64_ret_v_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i64_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; 
GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=v"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i64 %data seq_cst call void asm "; use $0", "a"(i64 %result) ret void } ; Input is AV, result also used as AV define void @global_atomic_xchg_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i64_ret_av_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 +; GFX950-NEXT: s_waitcnt 
vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=^VA"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i64 %data seq_cst call void asm "; use $0", "^VA"(i64 %result) ret void } ; Input is AV, used as v define void @global_atomic_xchg_i64_ret_av_v(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i64_ret_av_v: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i64_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; 
GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=^VA"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i64 %data seq_cst call void asm "; use $0", "v"(i64 %result) ret void } ; Input is AV, used as a define void @global_atomic_xchg_i64_ret_av_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i64_ret_av_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i64_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; 
GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=^VA"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i64 %data seq_cst call void asm "; use $0", "a"(i64 %result) ret void } ; Input is a, result used as AV define void @global_atomic_xchg_i64_ret_a_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i64_ret_a_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i64_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i64 %data seq_cst call void asm "; use $0", "^VA"(i64 %result) ret void } ; Input is v, result used as AV define void @global_atomic_xchg_i64_ret_v_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i64_ret_v_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i64_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: 
buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=v"() - %result = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i64 %data seq_cst call void asm "; use $0", "^VA"(i64 %result) ret void } define void @global_atomic_xchg_i64_noret_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i64_noret_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap_x2 v[0:1], a[0:1], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i64_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], a[0:1], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], a[0:1], off sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() %unused = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst @@ -607,18 +1006,30 @@ define void @global_atomic_xchg_i64_noret_a(ptr addrspace(1) %ptr) #0 { } define void @global_atomic_xchg_i64_noret_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xchg_i64_noret_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xchg_i64_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v[2:3], off sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr 
addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=^VA"() %unused = atomicrmw xchg ptr addrspace(1) %ptr, i64 %data seq_cst @@ -631,36 +1042,66 @@ define void @global_atomic_xchg_i64_noret_av(ptr addrspace(1) %ptr) #0 { ; Input and result use AGPR define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_a_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v3, v[0:1], off -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB21_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_xor_b32_e32 v2, v3, v4 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB21_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 
+; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB21_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB21_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=a"() %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst @@ 
-670,35 +1111,64 @@ define void @global_atomic_xor_expansion_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { ; Input is AGPR, result used as VGPR. define void @global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_a_v: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v2, v[0:1], off -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB22_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v5, v2 -; CHECK-NEXT: v_xor_b32_e32 v4, v5, v3 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB22_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB22_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=a"() %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst @@ -708,35 +1178,64 @@ define void @global_atomic_xor_expansion_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { ; Input is VGPR, result used as AGPR define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: 
global_atomic_xor_expansion_i32_ret_v_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v3, v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v4 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB23_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_xor_b32_e32 v2, v3, v4 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB23_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz 
.LBB23_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB23_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB23_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=v"() %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst @@ -746,34 +1245,62 @@ define void @global_atomic_xor_expansion_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { ; Input is AV, result also used as AV define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_av_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v2, v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def 
v3 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB24_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v5, v2 -; CHECK-NEXT: v_xor_b32_e32 v4, v5, v3 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB24_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB24_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB24_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst @@ -783,34 +1310,62 @@ define void @global_atomic_xor_expansion_i32_ret_av_av(ptr addrspace(1) %ptr) #0 ; Input is AV, used as v define void @global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_av_v: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v2, v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v3 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB25_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v5, v2 -; CHECK-NEXT: v_xor_b32_e32 v4, v5, v3 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: 
buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB25_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB25_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB25_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: 
v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB25_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst @@ -820,35 +1375,64 @@ define void @global_atomic_xor_expansion_i32_ret_av_v(ptr addrspace(1) %ptr) #0 ; Input is AV, used as a define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_av_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v3, v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v4 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB26_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_xor_b32_e32 v2, v3, v4 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB26_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; 
CHECK-NEXT: ; use a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB26_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB26_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; 
GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB26_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst @@ -858,35 +1442,64 @@ define void @global_atomic_xor_expansion_i32_ret_av_a(ptr addrspace(1) %ptr) #0 ; Input is a, result used as AV define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_a_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v2, v[0:1], off -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB27_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v5, v2 -; CHECK-NEXT: v_xor_b32_e32 v4, v5, v3 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB27_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword 
v2, v[0:1], off +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB27_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB27_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; 
use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=a"() %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst @@ -896,34 +1509,62 @@ define void @global_atomic_xor_expansion_i32_ret_a_av(ptr addrspace(1) %ptr) #0 ; Input is v, result used as AV define void @global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_v_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v2, v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v3 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB28_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v5, v2 -; CHECK-NEXT: v_xor_b32_e32 v4, v5, v3 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB28_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v2 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 
+; GFX90A-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB28_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB28_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_xor_b32_e32 v4, v5, v3 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB28_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=v"() %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst @@ -932,146 +1573,286 @@ define void @global_atomic_xor_expansion_i32_ret_v_av(ptr addrspace(1) %ptr) #0 } define void 
@global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i32_ret_av_av_no_agprs: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: v_accvgpr_write_b32 a33, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a32, v0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[0:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_write_b32 a0, 
v0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v1 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v2 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v3 -; CHECK-NEXT: v_accvgpr_write_b32 a4, v4 -; CHECK-NEXT: v_accvgpr_write_b32 a5, v5 -; CHECK-NEXT: v_accvgpr_write_b32 a6, v6 -; CHECK-NEXT: v_accvgpr_write_b32 a7, v7 -; CHECK-NEXT: v_accvgpr_write_b32 a8, v8 -; CHECK-NEXT: v_accvgpr_write_b32 a9, v9 -; CHECK-NEXT: v_accvgpr_write_b32 a10, v10 -; CHECK-NEXT: v_accvgpr_write_b32 a11, v11 -; CHECK-NEXT: v_accvgpr_write_b32 a12, v12 -; CHECK-NEXT: v_accvgpr_write_b32 a13, v13 -; CHECK-NEXT: v_accvgpr_write_b32 a14, v14 -; CHECK-NEXT: v_accvgpr_write_b32 a15, v15 -; CHECK-NEXT: v_accvgpr_write_b32 a16, v16 -; CHECK-NEXT: v_accvgpr_write_b32 a17, v17 -; CHECK-NEXT: v_accvgpr_write_b32 a18, v18 -; CHECK-NEXT: v_accvgpr_write_b32 a19, v19 -; CHECK-NEXT: v_accvgpr_write_b32 a20, v20 -; CHECK-NEXT: v_accvgpr_write_b32 a21, v21 -; CHECK-NEXT: v_accvgpr_write_b32 a22, v22 -; CHECK-NEXT: v_accvgpr_write_b32 a23, v23 -; CHECK-NEXT: v_accvgpr_write_b32 a24, v24 -; CHECK-NEXT: v_accvgpr_write_b32 a25, v25 -; CHECK-NEXT: v_accvgpr_write_b32 a26, v26 -; CHECK-NEXT: v_accvgpr_write_b32 a27, v27 -; CHECK-NEXT: v_accvgpr_write_b32 a28, v28 -; CHECK-NEXT: v_accvgpr_write_b32 a29, v29 -; CHECK-NEXT: v_accvgpr_write_b32 a30, v30 -; CHECK-NEXT: v_accvgpr_write_b32 a31, v31 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a32 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a33 -; CHECK-NEXT: global_load_dword v1, v[4:5], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB29_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v3, v1 -; CHECK-NEXT: v_xor_b32_e32 v2, v3, v0 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v1, v[4:5], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; 
CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB29_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a32, v1 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 -; CHECK-NEXT: v_accvgpr_read_b32 v1, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a2 -; CHECK-NEXT: v_accvgpr_read_b32 v3, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v4, a4 -; CHECK-NEXT: v_accvgpr_read_b32 v5, a5 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a6 -; CHECK-NEXT: v_accvgpr_read_b32 v7, a7 -; CHECK-NEXT: v_accvgpr_read_b32 v8, a8 -; CHECK-NEXT: v_accvgpr_read_b32 v9, a9 -; CHECK-NEXT: v_accvgpr_read_b32 v10, a10 -; CHECK-NEXT: v_accvgpr_read_b32 v11, a11 -; CHECK-NEXT: v_accvgpr_read_b32 v12, a12 -; CHECK-NEXT: v_accvgpr_read_b32 v13, a13 -; CHECK-NEXT: v_accvgpr_read_b32 v14, a14 -; CHECK-NEXT: v_accvgpr_read_b32 v15, a15 -; CHECK-NEXT: v_accvgpr_read_b32 v16, a16 -; CHECK-NEXT: v_accvgpr_read_b32 v17, a17 -; CHECK-NEXT: v_accvgpr_read_b32 v18, a18 -; CHECK-NEXT: v_accvgpr_read_b32 v19, a19 -; CHECK-NEXT: v_accvgpr_read_b32 v20, a20 -; CHECK-NEXT: v_accvgpr_read_b32 v21, a21 -; CHECK-NEXT: v_accvgpr_read_b32 v22, a22 -; CHECK-NEXT: v_accvgpr_read_b32 v23, a23 -; CHECK-NEXT: v_accvgpr_read_b32 v24, a24 -; CHECK-NEXT: v_accvgpr_read_b32 v25, a25 -; CHECK-NEXT: v_accvgpr_read_b32 v26, a26 -; CHECK-NEXT: v_accvgpr_read_b32 v27, a27 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a28 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a29 -; CHECK-NEXT: v_accvgpr_read_b32 v30, a30 -; CHECK-NEXT: v_accvgpr_read_b32 v31, a31 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[0:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a32 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_ret_av_av_no_agprs: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword 
v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a33, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: v_accvgpr_write_b32 a33, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: 
v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a32 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a33 +; GFX90A-NEXT: global_load_dword v1, v[4:5], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v0 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v1, v[4:5], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB29_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a32, v1 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: 
v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a32 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_load_dword a33, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v58, off, s[0:3], s32 
offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_ret_av_av_no_agprs: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_dword off, v40, s32 offset:68 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v41, s32 offset:64 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v42, s32 offset:60 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v43, s32 offset:56 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v44, s32 offset:52 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v45, s32 offset:48 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v46, s32 offset:44 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v47, s32 offset:40 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v56, s32 offset:36 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v57, s32 offset:32 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword 
off, v58, s32 offset:28 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v59, s32 offset:24 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v60, s32 offset:20 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v61, s32 offset:16 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v62, s32 offset:12 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, v63, s32 offset:8 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a32, s32 offset:4 ; 4-byte Folded Spill +; GFX950-NEXT: scratch_store_dword off, a33, s32 ; 4-byte Folded Spill +; GFX950-NEXT: v_accvgpr_write_b32 a33, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a32, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX950-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX950-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX950-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX950-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX950-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX950-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX950-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX950-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX950-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX950-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX950-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX950-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX950-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX950-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX950-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX950-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX950-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX950-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX950-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX950-NEXT: 
v_accvgpr_write_b32 a26, v26 +; GFX950-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX950-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX950-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX950-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a32 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a33 +; GFX950-NEXT: global_load_dword v1, v[4:5], off +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB29_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v0 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v1, v[4:5], v[2:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB29_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a32, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX950-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX950-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX950-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX950-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX950-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX950-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX950-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX950-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX950-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX950-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX950-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX950-NEXT: 
v_accvgpr_read_b32 v19, a19 +; GFX950-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX950-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX950-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX950-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX950-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX950-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX950-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX950-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX950-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX950-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX950-NEXT: v_accvgpr_read_b32 v31, a31 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a32 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_load_dword a33, off, s32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword a32, off, s32 offset:4 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v63, off, s32 offset:8 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v62, off, s32 offset:12 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v61, off, s32 offset:16 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v60, off, s32 offset:20 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:24 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:28 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:32 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:36 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:40 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:44 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:48 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:52 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:56 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:60 ; 
4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:64 ; 4-byte Folded Reload +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:68 ; 4-byte Folded Reload +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"() @@ -1084,32 +1865,58 @@ define void @global_atomic_xor_expansion_i32_ret_av_av_no_agprs(ptr addrspace(1) } define void @global_atomic_xor_expansion_i32_noret_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i32_noret_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v3, v[0:1], off -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v4, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB30_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_xor_b32_e32 v2, v3, v4 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB30_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; 
GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB30_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB30_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=a"() %unused = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst @@ -1117,31 +1924,56 @@ define void 
@global_atomic_xor_expansion_i32_noret_a(ptr addrspace(1) %ptr) #0 { } define void @global_atomic_xor_expansion_i32_noret_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i32_noret_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v3, v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v4 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB31_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_xor_b32_e32 v2, v3, v4 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB31_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i32_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: 
s_cbranch_execnz .LBB31_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i32_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v3, v4 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB31_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i32 asm "; def $0", "=^VA"() %unused = atomicrmw xor ptr addrspace(1) %ptr, i32 %data seq_cst @@ -1154,39 +1986,72 @@ define void @global_atomic_xor_expansion_i32_noret_av(ptr addrspace(1) %ptr) #0 ; Input and result use AGPR define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_a_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v7, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB32_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner 
Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_xor_b32_e32 v3, v5, v7 -; CHECK-NEXT: v_xor_b32_e32 v2, v4, v6 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a1, v3 -; CHECK-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB32_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz 
.LBB32_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB32_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst @@ -1196,37 +2061,68 @@ define void @global_atomic_xor_expansion_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { ; Input is AGPR, result used as VGPR. 
define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_a_v: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB33_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; CHECK-NEXT: v_xor_b32_e32 v5, v7, v3 -; CHECK-NEXT: v_xor_b32_e32 v4, v6, v2 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB33_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[4:5] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i64_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; 
GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB33_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i64_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB33_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data 
seq_cst @@ -1236,37 +2132,68 @@ define void @global_atomic_xor_expansion_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { ; Input is VGPR, result used as AGPR define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_v_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[6:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB34_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_xor_b32_e32 v3, v5, v7 -; CHECK-NEXT: v_xor_b32_e32 v2, v4, v6 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a1, v3 -; CHECK-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB34_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i64_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 +; 
GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i64_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[6:7] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB34_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr 
inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=v"() %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst @@ -1276,35 +2203,64 @@ define void @global_atomic_xor_expansion_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { ; Input is AV, result also used as AV define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_av_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB35_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; CHECK-NEXT: v_xor_b32_e32 v5, v7, v3 -; CHECK-NEXT: v_xor_b32_e32 v4, v6, v2 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB35_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[4:5] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; 
GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB35_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB35_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def 
$0", "=^VA"() %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst @@ -1314,35 +2270,64 @@ define void @global_atomic_xor_expansion_i64_ret_av_av(ptr addrspace(1) %ptr) #0 ; Input is AV, used as v define void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_av_v: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB36_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; CHECK-NEXT: v_xor_b32_e32 v5, v7, v3 -; CHECK-NEXT: v_xor_b32_e32 v4, v6, v2 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB36_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[4:5] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i64_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 +; 
GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB36_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i64_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB36_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=^VA"() %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst @@ -1352,37 +2337,68 @@ define 
void @global_atomic_xor_expansion_i64_ret_av_v(ptr addrspace(1) %ptr) #0 ; Input is AV, used as a define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_av_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[6:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB37_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_xor_b32_e32 v3, v5, v7 -; CHECK-NEXT: v_xor_b32_e32 v2, v4, v6 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v2 -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: v_accvgpr_write_b32 a1, v3 -; CHECK-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB37_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i64_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: 
global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB37_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i64_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[6:7] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB37_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) 
%ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=^VA"() %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst @@ -1392,37 +2408,68 @@ define void @global_atomic_xor_expansion_i64_ret_av_a(ptr addrspace(1) %ptr) #0 ; Input is a, result used as AV define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_a_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v3, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v2, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB38_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; CHECK-NEXT: v_xor_b32_e32 v5, v7, v3 -; CHECK-NEXT: v_xor_b32_e32 v4, v6, v2 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB38_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[4:5] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i64_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB38_1: 
; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB38_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i64_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB38_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; 
GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst @@ -1432,35 +2479,64 @@ define void @global_atomic_xor_expansion_i64_ret_a_av(ptr addrspace(1) %ptr) #0 ; Input is v, result used as AV define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i64_ret_v_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB39_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] -; CHECK-NEXT: v_xor_b32_e32 v5, v7, v3 -; CHECK-NEXT: v_xor_b32_e32 v4, v6, v2 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB39_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use v[4:5] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i64_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB39_1: ; 
%atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX90A-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i64_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB39_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 
= getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=v"() %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst @@ -1469,34 +2545,62 @@ define void @global_atomic_xor_expansion_i64_ret_v_av(ptr addrspace(1) %ptr) #0 } define void @global_atomic_xor_expansion_i64_noret_a(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i64_noret_a: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def a[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_accvgpr_read_b32 v7, a1 -; CHECK-NEXT: v_accvgpr_read_b32 v6, a0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: .LBB40_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_xor_b32_e32 v3, v5, v7 -; CHECK-NEXT: v_xor_b32_e32 v2, v4, v6 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB40_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i64_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i64_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB40_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=a"() %unused = atomicrmw 
xor ptr addrspace(1) %ptr, i64 %data seq_cst @@ -1504,36 +2608,12296 @@ define void @global_atomic_xor_expansion_i64_noret_a(ptr addrspace(1) %ptr) #0 { } define void @global_atomic_xor_expansion_i64_noret_av(ptr addrspace(1) %ptr) #0 { -; CHECK-LABEL: global_atomic_xor_expansion_i64_noret_av: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def v[6:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: .LBB41_1: ; %atomicrmw.start -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_xor_b32_e32 v3, v5, v7 -; CHECK-NEXT: v_xor_b32_e32 v2, v4, v6 -; CHECK-NEXT: buffer_wbl2 -; CHECK-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_invl2 -; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] -; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; CHECK-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_cbranch_execnz .LBB41_1 -; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end -; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX90A-LABEL: global_atomic_xor_expansion_i64_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[6:7] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX90A-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX90A-NEXT: buffer_wbl2 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_invl2 
+; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_expansion_i64_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[6:7] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v3, v5, v7 +; GFX950-NEXT: v_xor_b32_e32 v2, v4, v6 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc0 sc1 +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB41_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 %data = call i64 asm "; def $0", "=^VA"() %unused = atomicrmw xor ptr addrspace(1) %ptr, i64 %data seq_cst ret void } +;--------------------------------------------------------------------- +; xor i32 cases with instruction +;--------------------------------------------------------------------- + +; Input and result use AGPR +define void @global_atomic_xor_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: 
global_atomic_xor_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is AGPR, result used as VGPR. 
+define void @global_atomic_xor_i32_ret_a_v(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "v"(i32 %result) + ret void +} + +; Input is VGPR, result used as AGPR +define void @global_atomic_xor_i32_ret_v_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=v"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is AV, result also used as AV +define void @global_atomic_xor_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm 
"; def $0", "=^VA"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +; Input is AV, used as v +define void @global_atomic_xor_i32_ret_av_v(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "v"(i32 %result) + ret void +} + +; Input is AV, used as a +define void @global_atomic_xor_i32_ret_av_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +; Input is a, result used as AV +define void @global_atomic_xor_i32_ret_a_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 
+; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +; Input is v, result used as AV +define void @global_atomic_xor_i32_ret_v_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=v"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_xor_i32_ret_av_av_no_agprs(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte 
Folded Spill +; GFX90A-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_write_b32 
a0, v0 +; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX90A-NEXT: 
v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:31] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX90A-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_ret_av_av_no_agprs: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v42 ; Reload Reuse +; 
GFX950-NEXT: v_accvgpr_write_b32 a6, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v60 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a16, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a17, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a18, v63 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[8:11], s32 offset:32 ; 16-byte Folded Spill +; GFX950-NEXT: scratch_store_dwordx4 off, v[12:15], s32 offset:48 ; 16-byte Folded Spill +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_accvgpr_write_b32 a19, v31 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a20, v30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a21, v29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a22, v28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: scratch_store_dwordx3 off, v[16:18], s32 offset:64 ; 12-byte Folded Spill +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: global_atomic_xor v0, v[0:1], v2, off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; 
GFX950-NEXT: v_accvgpr_write_b32 a31, v19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a27, v23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a28, v22 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a29, v21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a30, v20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a23, v27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a24, v26 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a25, v25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a26, v24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: scratch_load_dwordx4 v[0:3], off, s32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[8:11], off, s32 offset:32 ; 16-byte Folded Reload +; GFX950-NEXT: scratch_load_dwordx4 v[12:15], off, s32 offset:48 ; 16-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v19, a31 ; Reload Reuse +; GFX950-NEXT: scratch_load_dwordx3 v[16:18], off, s32 offset:64 ; 12-byte Folded Reload +; GFX950-NEXT: v_accvgpr_read_b32 v23, a27 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v22, a28 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v21, a29 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v20, a30 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v27, a23 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v26, a24 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v25, a25 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v24, a26 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_read_b32 v31, a19 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v30, a20 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v29, a21 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v28, a22 ; Reload Reuse +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:31] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: 
v_accvgpr_read_b32 v63, a18 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a17 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v61, a16 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a14 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a13 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a3 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"() + %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0 + %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1 + %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1) + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_xor_i32_noret_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor v[0:1], a0, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol 
+; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor v[0:1], a0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %unused = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_atomic_xor_i32_noret_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor v[0:1], v2, off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_noret_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor v[0:1], v2, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %unused = atomicrmw xor ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +;--------------------------------------------------------------------- +; xor i64 cases with instruction +;--------------------------------------------------------------------- + +; Input and result use AGPR +define 
void @global_atomic_xor_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is AGPR, result used as VGPR. 
+define void @global_atomic_xor_i64_ret_a_v(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_ret_a_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_ret_a_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "v"(i64 %result) + ret void +} + +; Input is VGPR, result used as AGPR +define void @global_atomic_xor_i64_ret_v_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_ret_v_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; 
GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_ret_v_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=v"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is AV, result also used as AV +define void @global_atomic_xor_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 +; 
GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +; Input is AV, used as v +define void @global_atomic_xor_i64_ret_av_v(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_ret_av_v: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_ret_av_v: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "v"(i64 %result) + ret void +} + +; Input is AV, used as a +define void @global_atomic_xor_i64_ret_av_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: 
global_atomic_xor_i64_ret_av_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_ret_av_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +; Input is a, result used as AV +define void @global_atomic_xor_i64_ret_a_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_ret_a_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_ret_a_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +; Input is v, result used as AV +define void @global_atomic_xor_i64_ret_v_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_ret_v_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_ret_v_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: 
; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=v"() + %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_xor_i64_noret_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_noret_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], a[0:1], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_noret_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], a[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %unused = atomicrmw xor ptr addrspace(1) %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define void @global_atomic_xor_i64_noret_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_noret_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_noret_av: +; 
GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: buffer_wbl2 sc1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: buffer_inv sc1 +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %unused = atomicrmw xor ptr addrspace(1) %ptr, i64 %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret void +} + +;--------------------------------------------------------------------- +; other atomics i32, with aa+av cases +;--------------------------------------------------------------------- + +define void @global_atomic_add_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_add_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_add v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_add_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_add v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + 
%data = call i32 asm "; def $0", "=a"() + %result = atomicrmw add ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_add_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_add_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_add v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_add_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_add v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw add ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_sub_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_sub_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_sub_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw sub ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_sub_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_sub_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_sub_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw sub ptr 
addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_and_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_and_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_and v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_and_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_and v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw and ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_and_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_and_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_and v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_and_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_and v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw and ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_nand_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v2, v3, v4 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB69_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_nand_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_bitop3_b32 v2, v3, v4, v3 bitop3:0x3f +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB69_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw nand ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_nand_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_nand_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 
+; GFX90A-NEXT: v_and_b32_e32 v3, v5, v2 +; GFX90A-NEXT: v_not_b32_e32 v4, v3 +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB70_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_nand_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_bitop3_b32 v4, v5, v3, v5 bitop3:0x3f +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB70_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw nand ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_or_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: 
global_atomic_or_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_or v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_or_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_or v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw or ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_or_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_or_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_or v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_or_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: 
; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_or v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw or ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_max_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_max_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_smax v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_max_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_smax v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw max ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + 
+define void @global_atomic_max_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_max_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_smax v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_max_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_smax v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw max ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_min_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_min_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_smin v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_min_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; 
def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_smin v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw min ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_min_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_min_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_smin v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_min_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_smin v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw min ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_umax_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; 
GFX90A-LABEL: global_atomic_umax_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_umax v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umax_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_umax v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw umax ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_umax_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_umax_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_umax v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umax_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_umax v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw umax ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_umin_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_umin_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_umin v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umin_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_umin v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw umin ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void 
asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_umin_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_umin_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_umin v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umin_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_umin v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw umin ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_uinc_wrap_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_uinc_wrap_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_uinc_wrap_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_uinc_wrap_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_uinc_wrap_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_uinc_wrap_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 
%result) + ret void +} + +define void @global_atomic_udec_wrap_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_udec_wrap_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_udec_wrap_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw udec_wrap ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_udec_wrap_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_udec_wrap_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX950-LABEL: global_atomic_udec_wrap_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw udec_wrap ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_usub_cond_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_cond_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB85_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_cond_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_u32_e32 v2, v3, v4 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB85_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw usub_cond ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_usub_cond_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_cond_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start +; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_sub_u32_e32 v3, v5, v2 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB86_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_cond_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v3 +; GFX950-NEXT: v_sub_u32_e32 v3, v5, v2 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v5, v3, vcc +; GFX950-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB86_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", 
"=^VA"() + %result = atomicrmw usub_cond ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_sat_i32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB87_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_sat_i32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_u32_e64 v2, v3, v4 clamp +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: 
s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB87_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw usub_sat ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_usub_sat_i32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_sat_i32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v3 clamp +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB88_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_sat_i32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_sub_u32_e64 v4, v5, v3 clamp +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB88_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw usub_sat ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics i64, with aa+av cases +;--------------------------------------------------------------------- + +define void @global_atomic_add_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_add_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_add_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw add ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_add_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_add_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_add_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_add_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: 
s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw add ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_sub_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_sub_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_sub_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw sub ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void 
@global_atomic_sub_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_sub_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_sub_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw sub ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_and_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_and_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
global_atomic_and_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw and ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_and_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_and_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_and_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 
asm "; def $0", "=^VA"() + %result = atomicrmw and ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_nand_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_nand_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v2, v5, v7 +; GFX90A-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX90A-NEXT: v_not_b32_e32 v3, v2 +; GFX90A-NEXT: v_not_b32_e32 v2, v8 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB95_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_nand_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; 
GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, v5, v7 +; GFX950-NEXT: v_and_b32_e32 v8, v4, v6 +; GFX950-NEXT: v_not_b32_e32 v3, v2 +; GFX950-NEXT: v_not_b32_e32 v2, v8 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB95_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw nand ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_nand_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_nand_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX90A-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX90A-NEXT: v_not_b32_e32 v5, v4 +; 
GFX90A-NEXT: v_not_b32_e32 v4, v8 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB96_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_nand_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX950-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX950-NEXT: v_not_b32_e32 v5, v4 +; GFX950-NEXT: v_not_b32_e32 v4, v8 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB96_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw nand ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 
%result) + ret void +} + +define void @global_atomic_or_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_or_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_or_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw or ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_or_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_or_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off 
offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_or_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw or ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_max_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_max_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_smax_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_max_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_smax_x2 v[0:1], v[0:1], v[2:3], off 
offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw max ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_max_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_max_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_smax_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_max_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_smax_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw max ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_min_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_min_i64_ret_a_a: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_smin_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_min_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_smin_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw min ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_min_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_min_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_smin_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_min_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_smin_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw min ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_umax_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_umax_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_umax_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umax_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_umax_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; 
GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw umax ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_umax_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_umax_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_umax_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umax_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_umax_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw umax ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_umin_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_umin_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_umin_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umin_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_umin_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw umin ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_umin_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_umin_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_umin_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umin_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_umin_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw umin ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_uinc_wrap_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_uinc_wrap_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_uinc_wrap_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = 
getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_uinc_wrap_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_uinc_wrap_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_uinc_wrap_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_udec_wrap_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_udec_wrap_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; 
GFX90A-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_udec_wrap_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw udec_wrap ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_udec_wrap_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_udec_wrap_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_udec_wrap_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def 
v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw udec_wrap ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_usub_cond_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_cond_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB111_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_cond_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB111_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw usub_cond ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_usub_cond_i64_ret_av_av(ptr addrspace(1) 
%ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_cond_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB112_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_cond_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3] +; GFX950-NEXT: s_nop 1 
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB112_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw usub_cond ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_usub_sat_i64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_sat_i64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB113_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_sat_i64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v4, v6 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v7, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB113_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x 
i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw usub_sat ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_usub_sat_i64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_sat_i64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB114_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_sat_i64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB114_1: ; 
%atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB114_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw usub_sat ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics f32, with aa+av cases +;--------------------------------------------------------------------- + +define void @global_atomic_fadd_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fadd_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float 
asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @global_atomic_fsub_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB117_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off 
offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB117_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fsub_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_sub_f32_e32 v4, v5, v3 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB118_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
global_atomic_fsub_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_sub_f32_e32 v4, v5, v3 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB118_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @global_atomic_fmax_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: 
v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB119_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX950-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB119_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, 
!amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fmax_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v3 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB120_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v3 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt 
vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB120_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @global_atomic_fmin_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX90A-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB121_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX950-LABEL: global_atomic_fmin_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_max_f32_e32 v4, v2, v2 +; GFX950-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB121_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fmin_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX90A-NEXT: .LBB122_1: ; %atomicrmw.start +; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v2, v3 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB122_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX950-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_max_f32_e32 v2, v5, v5 +; GFX950-NEXT: v_min_f32_e32 v4, v2, v3 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB122_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, float %data 
syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB123_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_maximum3_f32 v2, v3, v4, 
v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB123_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fmaximum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_max_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB124_1 +; GFX90A-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_maximum3_f32 v4, v5, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB124_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_f32_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: 
v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_min_f32_e32 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB125_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_f32_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_minimum3_f32 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB125_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr 
inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fminimum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_f32_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_min_f32_e32 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB126_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_f32_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt 
vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_minimum3_f32 v4, v5, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB126_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics f64, with aa+av cases +;--------------------------------------------------------------------- + +define void @global_atomic_fadd_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def 
a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fadd_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, 
!amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], -v[6:7] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB129_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_add_f64 v[2:3], v[4:5], -v[6:7] +; 
GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB129_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fsub_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB130_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[6:7], v[4:5] +; GFX950-NEXT: v_add_f64 v[4:5], v[6:7], -v[2:3] +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB130_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @global_atomic_fmax_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fmax_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt 
vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @global_atomic_fmin_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, double %data 
syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fmin_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX90A-NEXT: .LBB135_1: ; %atomicrmw.start +; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB135_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[2:3], v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 
a1, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB135_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fmaximum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[4:5], v[10:11], v[2:3] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB136_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; 
GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[4:5] +; GFX950-NEXT: v_max_f64 v[4:5], v[10:11], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB136_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_f64_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX90A-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_min_f64 v[2:3], v[4:5], v[6:7] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] +; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB137_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_f64_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v7, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX950-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f64 v[2:3], v[4:5], v[6:7] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[6:7] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX950-NEXT: 
v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB137_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fminimum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_f64_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_min_f64 v[4:5], v[10:11], v[2:3] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB138_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[4:5] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_f64_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:80 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[10:11], v[4:5] +; GFX950-NEXT: v_min_f64 v[4:5], v[10:11], v[2:3] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v5, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v8, v4, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[8:11], off offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[10:11] +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB138_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[4:5] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} 
+ +;--------------------------------------------------------------------- +; other atomics v2f16, with aa+av cases +;--------------------------------------------------------------------- + +define void @global_atomic_fadd_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @global_atomic_fadd_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, 
v[0:1], v2, off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB141_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_pk_add_f16 v2, v3, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB141_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @global_atomic_fsub_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB142_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_add_f16 v4, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB142_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") 
seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX90A-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB143_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX950-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-NEXT: 
global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB143_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmax_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX90A-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v3 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB144_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use 
v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v3 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB144_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v4, v2, v2 +; 
GFX90A-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX90A-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB145_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_pk_max_f16 v4, v2, v2 +; GFX950-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB145_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = 
getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmin_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX90A-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v2 +; GFX90A-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v2, v3 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB146_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_pk_max_f16 v3, v3, v3 +; GFX950-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: 
v_mov_b32_e32 v5, v2 +; GFX950-NEXT: v_pk_max_f16 v2, v5, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_min_f16 v4, v2, v3 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB146_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8 +; GFX90A-NEXT: 
global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB147_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB147_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret 
void +} + +define void @global_atomic_fmaximum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_pk_max_f16 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB148_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, 
v2 +; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB148_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_v2f16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: .LBB149_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_min_f16 v2, v3, v4 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v3, v4 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v3, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v5, v2, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v2, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v2, v2, v6, s8 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB149_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_v2f16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB149_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v2, v3, v4, v4 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB149_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void 
@global_atomic_fminimum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_v2f16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_pk_min_f16 v4, v5, v2 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v3, v4, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v4, v3, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v4, v4, v6, s8 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB150_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_v2f16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v2, v[0:1], off offset:40 +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-NEXT: 
v_pk_minimum3_f16 v4, v5, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB150_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics v2bf16, with aa+av cases +;--------------------------------------------------------------------- + +define void @global_atomic_fadd_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB151_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, 
v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB151_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fadd_v2bf16_ret_av_av(ptr 
addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB152_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_add_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB152_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; 
GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB153_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: 
v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB153_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX950-NEXT: .LBB153_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_sub_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_sub_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB153_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr 
inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fsub_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB154_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: 
s_cbranch_execnz .LBB154_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: .LBB154_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_sub_f32_e32 v6, v6, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB154_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_v2bf16_ret_a_a: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB155_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB155_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX950-NEXT: .LBB155_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB155_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmax_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, 
v3 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB156_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB156_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: .LBB156_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) 
+; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB156_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB157_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 
v6, v6, v5 +; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB157_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX950-NEXT: .LBB157_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; 
GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB157_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmin_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB158_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 
vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB158_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: .LBB158_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB158_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = 
getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB159_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX90A-NEXT: v_max_f32_e32 v8, v2, v4 +; GFX90A-NEXT: v_max_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9 +; 
GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB159_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX950-NEXT: .LBB159_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_maximum3_f32 v2, v2, v4, v4 +; GFX950-NEXT: v_maximum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB159_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x 
bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmaximum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB160_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX90A-NEXT: v_max_f32_e32 v8, v5, v2 +; GFX90A-NEXT: v_max_f32_e32 v9, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 +; GFX90A-NEXT: global_atomic_cmpswap v5, 
v[0:1], v[6:7], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB160_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: .LBB160_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2 +; GFX950-NEXT: v_maximum3_f32 v6, v6, v3, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB160_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") 
seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_v2bf16_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB161_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX90A-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX90A-NEXT: v_min_f32_e32 v8, v2, v4 +; GFX90A-NEXT: v_min_f32_e32 v9, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v7, v6 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v5, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v5, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX90A-NEXT: v_add3_u32 v8, v8, v2, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v7, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v2, v7, v2, s9 +; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, 
s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB161_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_v2bf16_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v3, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX950-NEXT: .LBB161_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX950-NEXT: v_minimum3_f32 v2, v2, v4, v4 +; GFX950-NEXT: v_minimum3_f32 v6, v6, v5, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v6, v2 +; GFX950-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB161_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, 
!amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fminimum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_v2bf16_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB162_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v7, v5 +; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v7 +; GFX90A-NEXT: v_min_f32_e32 v8, v5, v2 +; GFX90A-NEXT: v_min_f32_e32 v9, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v4 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v5, v2 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v3, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v3, v9, vcc +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_add3_u32 v10, v10, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v5, v5 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX90A-NEXT: v_perm_b32 v6, v6, v5, s9 +; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] 
+; GFX90A-NEXT: s_cbranch_execnz .LBB162_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v5 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_v2bf16_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v4, v[0:1], off offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: .LBB162_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2 +; GFX950-NEXT: v_minimum3_f32 v6, v6, v3, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v6, v4 +; GFX950-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX950-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-NEXT: s_cbranch_execnz .LBB162_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v4 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics i32, 
with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void @global_atomic_xchg_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_xchg_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_swap v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_xchg_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_xchg_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
global_atomic_swap v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_add_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_add_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_add v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_add_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_add v0, v0, v1, s[0:1] 
offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw add ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_add_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_add_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_add v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_add_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_add v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw add ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_sub_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_sub_i32_saddr_ret_a_a: 
+; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_sub v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_sub_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_sub v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw sub ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_sub_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_sub_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_sub v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_sub_i32_saddr_ret_av_av: +; 
GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_sub v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw sub ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_and_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_and_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_and v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_and_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_and v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) 
%ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw and ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_and_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_and_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_and v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_and_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_and v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw and ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_nand_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB171_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, v1, v3 +; GFX90A-NEXT: v_not_b32_e32 v0, v0 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB171_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_nand_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB171_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_bitop3_b32 v0, v1, v3, v1 bitop3:0x3f +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB171_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + 
%gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw nand ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_nand_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_nand_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v2, v0, s[16:17] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB172_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_and_b32_e32 v2, v3, v1 +; GFX90A-NEXT: v_not_b32_e32 v2, v2 +; GFX90A-NEXT: global_atomic_cmpswap v2, v0, v[2:3], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB172_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_nand_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, 
v1 +; GFX950-NEXT: v_bitop3_b32 v4, v5, v2, v5 bitop3:0x3f +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB172_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw nand ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_or_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_or_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_or v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_or_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_or v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; 
GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw or ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_or_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_or_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_or v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_or_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_or v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw or ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_xor_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: 
;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_xor v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_xor v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw xor ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_xor_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: 
;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_xor v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_max_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_max_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_smax v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_max_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_smax v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw max ptr addrspace(1) %gep.0, i32 %data 
syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_max_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_max_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_smax v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_max_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_smax v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw max ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_min_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_min_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_smin v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_min_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_smin v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw min ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_min_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_min_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_smin v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_min_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_smin v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; 
GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw min ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_umax_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_umax_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_umax v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umax_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_umax v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw umax ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_umax_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: 
global_atomic_umax_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_umax v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umax_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_umax v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw umax ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_umin_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_umin_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_umin v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umin_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_umin v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw umin ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_umin_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_umin_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_umin v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umin_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_umin v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw umin ptr addrspace(1) %gep.0, i32 %data 
syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_uinc_wrap_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_uinc_wrap_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_inc v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_uinc_wrap_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_inc v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_uinc_wrap_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_uinc_wrap_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_inc v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_uinc_wrap_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_inc v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_udec_wrap_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_udec_wrap_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_dec v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_udec_wrap_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 
v1, a0 +; GFX950-NEXT: global_atomic_dec v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw udec_wrap ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_udec_wrap_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_udec_wrap_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_dec v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_udec_wrap_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_dec v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw udec_wrap ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void 
@global_atomic_usub_cond_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_cond_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB189_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_u32_e32 v0, v1, v3 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB189_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_cond_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB189_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_u32_e32 v0, v1, v3 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; 
GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB189_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw usub_cond ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_usub_cond_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_cond_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v2, v0, s[16:17] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB190_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v3, v2 +; GFX90A-NEXT: v_sub_u32_e32 v2, v3, v1 +; GFX90A-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap v2, v0, v[2:3], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB190_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 
exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_cond_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v2, v0, s[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB190_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_sub_u32_e32 v2, v3, v1 +; GFX950-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX950-NEXT: global_atomic_cmpswap v2, v0, v[2:3], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB190_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw usub_cond ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_sat_i32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; 
def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB191_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_u32_e64 v0, v1, v3 clamp +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB191_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_sat_i32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB191_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_u32_e64 v0, v1, v3 clamp +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB191_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 
s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=a"() + %result = atomicrmw usub_sat ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i32 %result) + ret void +} + +define void @global_atomic_usub_sat_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_sat_i32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB192_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_sub_u32_e64 v4, v5, v2 clamp +; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB192_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_sat_i32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 
+; GFX950-NEXT: v_sub_u32_e64 v4, v5, v2 clamp +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB192_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i32 asm "; def $0", "=^VA"() + %result = atomicrmw usub_sat ptr addrspace(1) %gep.0, i32 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i32 %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics i64, with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void @global_atomic_xchg_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_xchg_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xchg ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_xchg_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_xchg_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xchg_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xchg 
ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_add_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_add_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_add_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw add ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_add_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: 
global_atomic_add_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_add_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw add ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_sub_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_sub_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_sub_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw sub ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_sub_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_sub_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_sub_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw sub ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_and_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_and_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_and_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw and ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, 
!amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_and_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_and_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_and_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw and ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_nand_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_nand_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX90A-NEXT: .LBB201_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_and_b32_e32 v0, v3, v5 +; GFX90A-NEXT: v_and_b32_e32 v7, v2, v4 +; GFX90A-NEXT: v_not_b32_e32 v1, v0 +; GFX90A-NEXT: v_not_b32_e32 v0, v7 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB201_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_nand_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB201_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, v3, v5 +; GFX950-NEXT: v_and_b32_e32 v7, v2, v4 +; GFX950-NEXT: v_not_b32_e32 v1, v0 +; GFX950-NEXT: v_not_b32_e32 v0, v7 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: 
v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB201_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw nand ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_nand_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_nand_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB202_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_and_b32_e32 v2, v9, v1 +; GFX90A-NEXT: v_and_b32_e32 v3, v8, v0 +; GFX90A-NEXT: v_not_b32_e32 v7, v2 +; GFX90A-NEXT: v_not_b32_e32 v6, v3 +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB202_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
global_atomic_nand_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB202_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_and_b32_e32 v2, v9, v1 +; GFX950-NEXT: v_and_b32_e32 v3, v8, v0 +; GFX950-NEXT: v_not_b32_e32 v7, v2 +; GFX950-NEXT: v_not_b32_e32 v6, v3 +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB202_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw nand ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_or_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_or_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_or_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw or ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_or_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_or_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_or_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] 
+; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw or ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_xor_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr 
inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw xor ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_xor_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_xor_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_xor_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw xor ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_max_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_max_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 
v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_max_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw max ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_max_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_max_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
global_atomic_max_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw max ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_min_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_min_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_min_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw min ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_min_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_min_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_min_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw min ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_umax_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: 
global_atomic_umax_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umax_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw umax ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_umax_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_umax_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_umax_x2 v[0:1], v2, 
v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umax_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw umax ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_umin_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_umin_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umin_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw umin ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_umin_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_umin_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_umin_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw umin ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") 
seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_uinc_wrap_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_uinc_wrap_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_uinc_wrap_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_uinc_wrap_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_uinc_wrap_i64_saddr_ret_av_av: +; 
GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_uinc_wrap_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw uinc_wrap ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_udec_wrap_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_udec_wrap_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX950-LABEL: global_atomic_udec_wrap_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw udec_wrap ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_udec_wrap_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_udec_wrap_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_udec_wrap_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw udec_wrap ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_usub_cond_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_cond_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB219_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB219_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
global_atomic_usub_cond_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB219_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB219_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw usub_cond ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_usub_cond_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_cond_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB220_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX90A-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB220_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_cond_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB220_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX950-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: s_nop 1 +; 
GFX950-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB220_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw usub_cond ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +define void @global_atomic_usub_sat_i64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_sat_i64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB221_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB221_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_sat_i64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB221_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v2, v4 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB221_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=a"() + %result = atomicrmw usub_sat ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(i64 %result) + ret void +} + +define void @global_atomic_usub_sat_i64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_usub_sat_i64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB222_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX90A-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB222_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_usub_sat_i64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-NEXT: 
global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB222_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v8, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v1, vcc +; GFX950-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v7, v3, 0, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB222_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x i64], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call i64 asm "; def $0", "=^VA"() + %result = atomicrmw usub_sat ptr addrspace(1) %gep.0, i64 %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(i64 %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics f32, with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void @global_atomic_fadd_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; 
GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fadd_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 
+; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @global_atomic_fsub_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB225_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_sub_f32_e32 v0, v1, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB225_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB225_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_sub_f32_e32 v0, v1, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB225_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fsub_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB226_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: 
v_sub_f32_e32 v4, v5, v2 +; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB226_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB226_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 +; GFX950-NEXT: v_sub_f32_e32 v4, v5, v2 +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB226_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @global_atomic_fmax_f32_saddr_ret_a_a(ptr addrspace(1) 
inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX90A-NEXT: .LBB227_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB227_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX950-NEXT: .LBB227_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB227_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fmax_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .LBB228_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX90A-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB228_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX950-NEXT: .LBB228_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 +; GFX950-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX950-NEXT: v_max_f32_e32 v4, v1, v2 +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB228_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @global_atomic_fmin_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: 
v_max_f32_e32 v3, v0, v0 +; GFX90A-NEXT: .LBB229_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB229_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_max_f32_e32 v3, v0, v0 +; GFX950-NEXT: .LBB229_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB229_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; 
GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fmin_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX90A-NEXT: .LBB230_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX90A-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB230_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v2 +; 
GFX950-NEXT: .LBB230_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 +; GFX950-NEXT: v_max_f32_e32 v1, v5, v5 +; GFX950-NEXT: v_min_f32_e32 v4, v1, v2 +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB230_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX90A-NEXT: .LBB231_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f32_e32 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB231_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB231_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_maximum3_f32 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB231_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void 
@global_atomic_fmaximum_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB232_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_max_f32_e32 v3, v5, v1 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB232_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 +; GFX950-NEXT: v_maximum3_f32 v4, v5, v2, v2 +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; 
GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB232_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_f32_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX90A-NEXT: .LBB233_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_min_f32_e32 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB233_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 
+; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_f32_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB233_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_minimum3_f32 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB233_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call float asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "a"(float %result) + ret void +} + +define void @global_atomic_fminimum_f32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_f32_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: ;;#ASMSTART 
+; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB234_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_min_f32_e32 v3, v5, v1 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB234_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_f32_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 +; GFX950-NEXT: v_minimum3_f32 v4, v5, v2, v2 +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB234_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr addrspace(1) %ptr, i64 0, i64 10 + 
%data = call float asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, float %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + call void asm "; use $0", "^VA"(float %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics f64, with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void @global_atomic_fadd_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", 
"=a"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fadd_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND 
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB237_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB237_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: .LBB237_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: 
s_cbranch_execnz .LBB237_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fsub_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB238_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB238_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 
+; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB238_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_add_f64 v[6:7], v[8:9], -v[0:1] +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB238_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @global_atomic_fmax_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_f64_saddr_ret_a_a: +; GFX950: 
; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fmax_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = 
getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @global_atomic_fmin_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX950-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; 
use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fmin_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 
v7, 0x7ff80000 +; GFX90A-NEXT: .LBB243_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB243_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX950-NEXT: .LBB243_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_max_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB243_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fmaximum_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB244_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: v_max_f64 v[2:3], v[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; 
GFX90A-NEXT: s_cbranch_execnz .LBB244_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB244_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_max_f64 v[2:3], v[8:9], v[0:1] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB244_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; 
GFX90A-LABEL: global_atomic_fminimum_f64_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v6, s[16:17] offset:80 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX90A-NEXT: .LBB245_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB245_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_f64_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v6, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] offset:80 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v5, a1 +; GFX950-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX950-NEXT: .LBB245_1: ; 
%atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_min_f64 v[0:1], v[2:3], v[4:5] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[4:5] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b64_e32 v[2:3], v[0:1] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB245_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(double %result) + ret void +} + +define void @global_atomic_fminimum_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_f64_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: global_load_dwordx2 v[2:3], v4, s[16:17] offset:80 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB246_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[2:3], v[2:3] op_sel:[0,1] +; 
GFX90A-NEXT: v_min_f64 v[2:3], v[8:9], v[0:1] +; GFX90A-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] +; GFX90A-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc +; GFX90A-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[16:17] offset:80 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB246_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v[2:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_f64_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:80 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v[0:1] +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB246_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[8:9], v[2:3] +; GFX950-NEXT: v_min_f64 v[2:3], v[8:9], v[0:1] +; GFX950-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[0:1] +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v3, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e64 v6, v2, 0, vcc +; GFX950-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[6:9], s[0:1] offset:80 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB246_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v[2:3] +; GFX950-NEXT: ;;#ASMEND +; 
GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call double asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, double %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(double %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics v2f16, with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void @global_atomic_fadd_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_pk_add_f16 v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, <2 x 
half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @global_atomic_fadd_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v0, v1, s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_pk_add_f16 v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: 
s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB249_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_add_f16 v0, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB249_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB249_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_pk_add_f16 v0, v1, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB249_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x 
half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @global_atomic_fsub_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB250_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB250_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 +; GFX950-NEXT: 
v_pk_add_f16 v4, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB250_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0 +; GFX90A-NEXT: .LBB251_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB251_1 +; 
GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v0 +; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_max_f16 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB251_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmax_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: .LBB252_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX90A-NEXT: v_pk_max_f16 v4, v1, v2 +; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB252_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-NEXT: .LBB252_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 +; GFX950-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_max_f16 v4, v1, v2 +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB252_1 +; GFX950-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v3, v0, v0 +; GFX90A-NEXT: .LBB253_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX90A-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB253_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_pk_max_f16 v3, v0, v0 +; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_min_f16 v0, v0, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB253_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmin_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v1, v0, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX90A-NEXT: .LBB254_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: 
s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v1 +; GFX90A-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX90A-NEXT: v_pk_min_f16 v4, v1, v2 +; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT: s_cbranch_execnz .LBB254_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_pk_max_f16 v2, v2, v2 +; GFX950-NEXT: .LBB254_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 +; GFX950-NEXT: v_pk_max_f16 v1, v5, v5 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_pk_min_f16 v4, v1, v2 +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB254_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, <2 x half> %data 
syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: .LBB255_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_max_f16 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v4, v0, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s8 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB255_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_v2f16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, 
v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB255_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_pk_maximum3_f16 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB255_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @global_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB256_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: 
v_pk_max_f16 v3, v5, v1 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v3, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v4, v3, v4, s8 +; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB256_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 +; GFX950-NEXT: v_pk_maximum3_f16 v4, v5, v2, v2 +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB256_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 
0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_v2f16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: .LBB257_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_pk_min_f16 v0, v1, v3 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v1, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v5, v4, v0, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v0, v4, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v0, v0, v5, s8 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB257_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_v2f16_saddr_ret_a_a: +; GFX950: ; 
%bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX950-NEXT: .LBB257_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_pk_minimum3_f16 v0, v1, v3, v3 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB257_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x half> %result) + ret void +} + +define void @global_atomic_fminimum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_v2f16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7e00 +; GFX90A-NEXT: s_mov_b32 s8, 0x5040100 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: .LBB258_1: ; %atomicrmw.start +; 
GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_pk_min_f16 v3, v5, v1 +; GFX90A-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:WORD_1 +; GFX90A-NEXT: v_cmp_o_f16_e64 s[4:5], v5, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_sdwa v3, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX90A-NEXT: v_perm_b32 v4, v3, v4, s8 +; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB258_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_v2f16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v1, v0, s[0:1] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v1 +; GFX950-NEXT: v_pk_minimum3_f16 v4, v5, v2, v2 +; GFX950-NEXT: global_atomic_cmpswap v1, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB258_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v1 +; 
GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x half> asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x half> %result) + ret void +} + +;--------------------------------------------------------------------- +; other atomics v2bf16, with aa+av cases using saddr +;--------------------------------------------------------------------- + +define void @global_atomic_fadd_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB259_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_add_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: 
v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB259_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX950-NEXT: global_atomic_pk_add_bf16 v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fadd_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fadd_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB260_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX90A-NEXT: v_add_f32_e32 v3, v3, v1 +; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB260_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fadd_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v1 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: global_atomic_pk_add_bf16 v0, v0, v1, s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: 
s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fadd ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB261_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: 
v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB261_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX950-NEXT: .LBB261_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX950-NEXT: v_sub_f32_e32 v5, v5, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB261_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fsub ptr addrspace(1) 
%gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fsub_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fsub_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB262_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX90A-NEXT: v_sub_f32_e32 v3, v3, v1 +; GFX90A-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB262_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; 
GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fsub_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: .LBB262_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX950-NEXT: v_sub_f32_e32 v3, v3, v1 +; GFX950-NEXT: v_sub_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3 +; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB262_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fsub ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB263_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB263_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX950-NEXT: .LBB263_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX950-NEXT: v_max_f32_e32 v5, v5, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB263_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmax_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmax_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; 
GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB264_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v3, v3, v1 +; GFX90A-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB264_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmax_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: 
.LBB264_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v1 +; GFX950-NEXT: v_max_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3 +; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB264_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmax ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB265_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt 
vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX90A-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB265_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX950-NEXT: .LBB265_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, 
v3 +; GFX950-NEXT: v_min_f32_e32 v5, v5, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB265_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmin_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmin_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v3, v0, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v2 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB266_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v3 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v3, v3, v1 +; GFX90A-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX90A-NEXT: v_bfe_u32 v6, 
v3, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8 +; GFX90A-NEXT: v_add3_u32 v8, v8, v4, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc +; GFX90A-NEXT: v_perm_b32 v4, v4, v3, s9 +; GFX90A-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB266_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmin_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: .LBB266_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX950-NEXT: v_min_f32_e32 v3, v3, v1 +; GFX950-NEXT: v_min_f32_e32 v4, v4, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3 +; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB266_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmin ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB267_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_max_f32_e32 v7, v0, v3 +; GFX90A-NEXT: v_max_f32_e32 v8, v6, v5 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v5 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v4, v8, vcc +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: 
v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB267_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX950-NEXT: .LBB267_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_maximum3_f32 v0, v0, v3, v3 +; GFX950-NEXT: v_maximum3_f32 v5, v5, v4, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 
s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB267_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fmaximum_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v4, v0, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB268_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_max_f32_e32 v7, v4, v1 +; GFX90A-NEXT: v_max_f32_e32 v8, v6, v3 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v3 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; 
GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB268_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fmaximum_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: .LBB268_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX950-NEXT: v_maximum3_f32 v3, v3, v1, v1 +; GFX950-NEXT: v_maximum3_f32 v4, v4, v2, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3 +; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] 
+; GFX950-NEXT: s_cbranch_execnz .LBB268_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fmaximum ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_v2bf16_saddr_ret_a_a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: global_load_dword v1, v2, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB269_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX90A-NEXT: v_min_f32_e32 v7, v0, v3 +; GFX90A-NEXT: v_min_f32_e32 v8, v6, v5 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v5 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v0, v3 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v4, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v4, v8, vcc +; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: 
v_add3_u32 v7, v7, v0, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9 +; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB269_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use a0 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_v2bf16_saddr_ret_a_a: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-NEXT: global_load_dword v1, v2, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX950-NEXT: .LBB269_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_minimum3_f32 v0, v0, v3, v3 +; GFX950-NEXT: v_minimum3_f32 v5, v5, v4, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v0 +; GFX950-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: 
v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB269_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use a0 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=a"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "a"(<2 x bfloat> %result) + ret void +} + +define void @global_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %ptr) #0 { +; GFX90A-LABEL: global_atomic_fminimum_v2bf16_saddr_ret_av_av: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: global_load_dword v4, v0, s[16:17] offset:40 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v3 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_mov_b64 s[6:7], 0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; GFX90A-NEXT: s_movk_i32 s8, 0x7fff +; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX90A-NEXT: s_mov_b32 s9, 0x7060302 +; GFX90A-NEXT: .LBB270_1: ; %atomicrmw.start +; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, v4 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX90A-NEXT: v_min_f32_e32 v7, v4, v1 +; GFX90A-NEXT: v_min_f32_e32 v8, v6, v3 +; GFX90A-NEXT: v_cmp_o_f32_e32 vcc, v6, v3 +; GFX90A-NEXT: v_cmp_o_f32_e64 s[4:5], v4, v1 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v2, v7, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; 
GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s8 +; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8 +; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v4, v4 +; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v8, s[4:5] +; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX90A-NEXT: v_perm_b32 v4, v6, v4, s9 +; GFX90A-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[16:17] offset:40 glc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX90A-NEXT: s_cbranch_execnz .LBB270_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v4 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_atomic_fminimum_v2bf16_saddr_ret_av_av: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-NEXT: global_load_dword v3, v0, s[0:1] offset:40 +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; def v2 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: .LBB270_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v5, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX950-NEXT: v_minimum3_f32 v3, v3, v1, v1 +; GFX950-NEXT: v_minimum3_f32 v4, v4, v2, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v3 +; GFX950-NEXT: global_atomic_cmpswap v3, v0, v[4:5], s[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB270_1 +; 
GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: ;;#ASMSTART +; GFX950-NEXT: ; use v3 +; GFX950-NEXT: ;;#ASMEND +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr addrspace(1) %ptr, i64 0, i64 10 + %data = call <2 x bfloat> asm "; def $0", "=^VA"() + %result = atomicrmw fminimum ptr addrspace(1) %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, !amdgpu.no.fine.grained.memory !0 + call void asm "; use $0", "^VA"(<2 x bfloat> %result) + ret void +} + attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" } + +!0 = !{} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 004d3c0c1cf53..3dedf008c917e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -1,8 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s -; Test using saddr addressing mode of flat_* atomic instructions. 
+; Test using saddr addressing mode of flat_* atomic instructions. Make +; sure these are not incorrectly selected before gfx1250. define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn: @@ -11,6 +14,29 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -25,6 +51,29 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 
0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 offset:2047 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 offset:2047 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2047 @@ -40,6 +89,35 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; 
GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xfffff800, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 @@ -55,6 +133,29 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -70,6 +171,29 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff ; GFX1250-NEXT: 
global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn_2048: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:2048 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn_2048: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 offset:2048 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2048 @@ -86,6 +210,35 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn_neg2048: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn_neg2048: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xfffff800, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 @@ -128,6 +281,33 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: 
v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -164,6 +344,33 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:42 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 offset:42 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr, ptr addrspace(3) 
@ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -199,6 +406,33 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -232,6 +466,33 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 offset:42 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 offset:42 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -338,6 +599,86 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB10_5 ; GFX1250-GISEL-NEXT: .LBB10_5: +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB10_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 
s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB10_4 +; GFX950-SDAG-NEXT: .LBB10_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB10_5 +; GFX950-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB10_2 +; GFX950-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB10_5 +; GFX950-SDAG-NEXT: .LBB10_5: +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB10_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB10_4 +; GFX950-GISEL-NEXT: .LBB10_2: ; 
%atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB10_5 +; GFX950-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB10_2 +; GFX950-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[4:5], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB10_5 +; GFX950-GISEL-NEXT: .LBB10_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -441,6 +782,92 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB11_5 ; GFX1250-GISEL-NEXT: .LBB11_5: +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; 
implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB11_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB11_4 +; GFX950-SDAG-NEXT: .LBB11_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB11_5 +; GFX950-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB11_2 +; GFX950-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB11_5 +; GFX950-SDAG-NEXT: .LBB11_5: +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: 
v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB11_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB11_4 +; GFX950-GISEL-NEXT: .LBB11_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB11_5 +; GFX950-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB11_2 +; GFX950-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[4:5], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB11_5 +; GFX950-GISEL-NEXT: .LBB11_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -522,6 +949,72 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB12_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB12_4 +; GFX950-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB12_2 +; GFX950-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v0, v[2:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB12_3 +; GFX950-GISEL-NEXT: 
; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB12_4 +; GFX950-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB12_2 +; GFX950-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v0, v[4:5], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xchg ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -607,6 +1100,78 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; 
GFX950-SDAG-NEXT: s_cbranch_execnz .LBB13_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB13_4 +; GFX950-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB13_2 +; GFX950-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v0, v[2:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB13_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB13_4 +; GFX950-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; 
GFX950-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB13_2 +; GFX950-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v0, v[4:5], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -626,6 +1191,29 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_add_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_add_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw add ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -641,6 +1229,35 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_add_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_add_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -656,6 +1273,29 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr 
inreg %sbase, i32 %voffset, ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_add_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_add_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw add ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -669,6 +1309,35 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_add_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; 
GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_add_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -766,6 +1435,90 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB18_5 ; GFX1250-GISEL-NEXT: .LBB18_5: +; +; GFX950-SDAG-LABEL: flat_add_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB18_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB18_4 +; GFX950-SDAG-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB18_5 +; GFX950-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; 
GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB18_2 +; GFX950-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB18_5 +; GFX950-SDAG-NEXT: .LBB18_5: +; +; GFX950-GISEL-LABEL: flat_add_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB18_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB18_4 +; GFX950-GISEL-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB18_5 +; GFX950-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: 
flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB18_2 +; GFX950-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB18_5 +; GFX950-GISEL-NEXT: .LBB18_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw add ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -869,6 +1622,96 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB19_5 ; GFX1250-GISEL-NEXT: .LBB19_5: +; +; GFX950-SDAG-LABEL: flat_add_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: 
s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB19_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB19_4 +; GFX950-SDAG-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB19_5 +; GFX950-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB19_2 +; GFX950-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB19_5 +; GFX950-SDAG-NEXT: .LBB19_5: +; +; GFX950-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; 
GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB19_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB19_4 +; GFX950-GISEL-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB19_5 +; GFX950-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB19_2 +; GFX950-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB19_5 +; GFX950-GISEL-NEXT: .LBB19_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -956,6 +1799,80 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: 
scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_add_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB20_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB20_4 +; GFX950-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB20_2 +; GFX950-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_add_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; 
GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB20_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB20_4 +; GFX950-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB20_2 +; GFX950-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw add ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -1047,6 +1964,86 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: 
flat_add_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB21_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB21_4 +; GFX950-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB21_2 +; GFX950-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: 
v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB21_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB21_4 +; GFX950-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB21_2 +; GFX950-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1066,6 +2063,29 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: 
global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw sub ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1081,6 +2101,35 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1096,6 +2145,29 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 
+; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw sub ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1109,6 +2181,35 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1206,6 +2307,92 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB26_5 
; GFX1250-GISEL-NEXT: .LBB26_5: +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB26_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB26_4 +; GFX950-SDAG-NEXT: .LBB26_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB26_5 +; GFX950-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB26_2 +; GFX950-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB26_5 +; GFX950-SDAG-NEXT: .LBB26_5: +; +; GFX950-GISEL-LABEL: 
flat_sub_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB26_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB26_4 +; GFX950-GISEL-NEXT: .LBB26_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB26_5 +; GFX950-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB26_2 +; GFX950-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch 
.LBB26_5 +; GFX950-GISEL-NEXT: .LBB26_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw sub ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -1309,6 +2496,98 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB27_5 ; GFX1250-GISEL-NEXT: .LBB27_5: +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB27_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB27_4 +; GFX950-SDAG-NEXT: .LBB27_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB27_5 +; GFX950-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB27_2 +; GFX950-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; 
GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB27_5 +; GFX950-SDAG-NEXT: .LBB27_5: +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB27_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB27_4 +; GFX950-GISEL-NEXT: .LBB27_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB27_5 +; GFX950-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB27_2 +; GFX950-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB27_5 +; GFX950-GISEL-NEXT: .LBB27_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1396,6 +2675,82 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB28_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB28_4 +; GFX950-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global +; 
GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB28_2 +; GFX950-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB28_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB28_4 +; GFX950-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; 
implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB28_2 +; GFX950-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw sub ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -1487,6 +2842,88 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB29_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB29_4 +; GFX950-SDAG-NEXT: .LBB29_2: ; 
%atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB29_2 +; GFX950-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB29_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB29_4 +; GFX950-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm 
+; GFX950-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB29_2 +; GFX950-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1506,6 +2943,29 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_and_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_and_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; 
GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw and ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1521,6 +2981,35 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_and_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_and_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part 
epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1536,6 +3025,29 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_and_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_and_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw and ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1549,6 +3061,35 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_and_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; 
GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_and_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1648,6 +3189,90 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB34_5 ; GFX1250-GISEL-NEXT: .LBB34_5: +; +; GFX950-SDAG-LABEL: flat_and_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB34_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB34_4 
+; GFX950-SDAG-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB34_5 +; GFX950-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB34_2 +; GFX950-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB34_5 +; GFX950-SDAG-NEXT: .LBB34_5: +; +; GFX950-GISEL-LABEL: flat_and_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB34_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB34_4 +; 
GFX950-GISEL-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB34_5 +; GFX950-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB34_2 +; GFX950-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB34_5 +; GFX950-GISEL-NEXT: .LBB34_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw and ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -1753,6 +3378,96 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB35_5 ; GFX1250-GISEL-NEXT: .LBB35_5: +; +; GFX950-SDAG-LABEL: flat_and_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 
s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB35_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB35_4 +; GFX950-SDAG-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB35_5 +; GFX950-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB35_2 +; GFX950-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB35_5 +; GFX950-SDAG-NEXT: .LBB35_5: +; +; GFX950-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; 
GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB35_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB35_4 +; GFX950-GISEL-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB35_5 +; GFX950-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB35_2 +; GFX950-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB35_5 +; GFX950-GISEL-NEXT: .LBB35_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1842,6 +3557,80 @@ define 
amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_and_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB36_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB36_4 +; GFX950-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB36_2 +; GFX950-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_and_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: 
v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB36_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB36_4 +; GFX950-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB36_2 +; GFX950-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw and ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -1935,6 +3724,86 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], 
off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_and_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB37_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB37_4 +; GFX950-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB37_2 +; GFX950-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 
v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB37_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB37_4 +; GFX950-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB37_2 +; GFX950-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1954,6 +3823,29 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg 
%sbase, i32 %voffset, i3 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_or_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_or_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw or ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1969,6 +3861,35 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_or_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_or_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1984,6 +3905,29 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_or_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_or_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw or ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1997,6 +3941,35 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_or_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_or_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2096,6 +4069,90 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX1250-GISEL-NEXT: s_branch .LBB42_5 ; GFX1250-GISEL-NEXT: .LBB42_5: +; +; GFX950-SDAG-LABEL: flat_or_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB42_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB42_4 +; GFX950-SDAG-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB42_5 +; GFX950-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB42_2 +; GFX950-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB42_5 +; GFX950-SDAG-NEXT: .LBB42_5: +; +; GFX950-GISEL-LABEL: flat_or_saddr_i64_rtn: +; 
GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB42_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB42_4 +; GFX950-GISEL-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB42_5 +; GFX950-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB42_2 +; GFX950-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB42_5 +; GFX950-GISEL-NEXT: .LBB42_5: %zext.offset = zext i32 %voffset to i64 
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw or ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -2201,6 +4258,96 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB43_5 ; GFX1250-GISEL-NEXT: .LBB43_5: +; +; GFX950-SDAG-LABEL: flat_or_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB43_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB43_4 +; GFX950-SDAG-NEXT: .LBB43_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB43_5 +; GFX950-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB43_2 +; GFX950-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; 
GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB43_5 +; GFX950-SDAG-NEXT: .LBB43_5: +; +; GFX950-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB43_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB43_4 +; GFX950-GISEL-NEXT: .LBB43_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB43_5 +; GFX950-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB43_2 +; GFX950-GISEL-NEXT: .LBB43_4: ; 
%atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB43_5 +; GFX950-GISEL-NEXT: .LBB43_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2290,6 +4437,80 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_or_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB44_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB44_4 +; GFX950-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: 
$vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB44_2 +; GFX950-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_or_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB44_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB44_4 +; GFX950-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB44_2 +; GFX950-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 
vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw or ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -2383,6 +4604,86 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_or_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB45_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB45_4 +; GFX950-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: 
; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB45_2 +; GFX950-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB45_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB45_4 +; GFX950-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; 
GFX950-GISEL-NEXT: s_cbranch_execz .LBB45_2 +; GFX950-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2402,6 +4703,29 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xor ptr 
%gep0, i32 %data syncscope("agent") seq_cst @@ -2417,6 +4741,35 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2432,6 +4785,29 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: 
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xor ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -2445,6 +4821,35 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; 
GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2544,6 +4949,90 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB50_5 ; GFX1250-GISEL-NEXT: .LBB50_5: +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB50_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB50_4 +; GFX950-SDAG-NEXT: .LBB50_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB50_5 +; GFX950-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; 
implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB50_2 +; GFX950-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB50_5 +; GFX950-SDAG-NEXT: .LBB50_5: +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB50_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB50_4 +; GFX950-GISEL-NEXT: .LBB50_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB50_5 +; GFX950-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; 
implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB50_2 +; GFX950-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB50_5 +; GFX950-GISEL-NEXT: .LBB50_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xor ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -2649,6 +5138,96 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB51_5 ; GFX1250-GISEL-NEXT: .LBB51_5: +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB51_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB51_4 +; 
GFX950-SDAG-NEXT: .LBB51_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB51_5 +; GFX950-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB51_2 +; GFX950-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB51_5 +; GFX950-SDAG-NEXT: .LBB51_5: +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz 
.LBB51_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB51_4 +; GFX950-GISEL-NEXT: .LBB51_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB51_5 +; GFX950-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB51_2 +; GFX950-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB51_5 +; GFX950-GISEL-NEXT: .LBB51_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2738,6 +5317,80 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 
v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB52_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB52_4 +; GFX950-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB52_2 +; GFX950-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: 
s_cbranch_execnz .LBB52_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB52_4 +; GFX950-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB52_2 +; GFX950-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xor ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -2831,6 +5484,86 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 
s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB53_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB53_4 +; GFX950-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB53_2 +; GFX950-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 
vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB53_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB53_4 +; GFX950-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB53_2 +; GFX950-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2848,6 +5581,25 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_max_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_smax v0, v[0:1], 
v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_max_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smax v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw max ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2861,6 +5613,31 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_max_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_smax v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_max_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: 
flat_atomic_smax v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2875,6 +5652,25 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw max ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2887,6 +5683,31 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: 
v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2986,6 +5807,92 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB58_5 ; GFX1250-GISEL-NEXT: .LBB58_5: +; +; GFX950-SDAG-LABEL: flat_max_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB58_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB58_4 +; GFX950-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB58_5 +; 
GFX950-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB58_2 +; GFX950-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB58_5 +; GFX950-SDAG-NEXT: .LBB58_5: +; +; GFX950-GISEL-LABEL: flat_max_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB58_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB58_4 +; GFX950-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: 
s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB58_5 +; GFX950-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB58_2 +; GFX950-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB58_5 +; GFX950-GISEL-NEXT: .LBB58_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw max ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -3091,6 +5998,98 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB59_5 ; GFX1250-GISEL-NEXT: .LBB59_5: +; +; GFX950-SDAG-LABEL: flat_max_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 
s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB59_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB59_4 +; GFX950-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB59_5 +; GFX950-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB59_2 +; GFX950-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB59_5 +; GFX950-SDAG-NEXT: .LBB59_5: +; +; GFX950-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB59_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB59_4 +; GFX950-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB59_5 +; GFX950-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB59_2 +; GFX950-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB59_5 +; GFX950-GISEL-NEXT: .LBB59_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr 
%sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3176,6 +6175,80 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_max_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB60_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB60_4 +; GFX950-SDAG-NEXT: .LBB60_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB60_2 +; GFX950-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: 
s_endpgm +; +; GFX950-GISEL-LABEL: flat_max_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB60_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB60_4 +; GFX950-GISEL-NEXT: .LBB60_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB60_2 +; GFX950-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw max ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -3265,6 +6338,86 
@@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_max_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB61_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB61_4 +; GFX950-SDAG-NEXT: .LBB61_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB61_2 +; GFX950-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: 
scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB61_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB61_4 +; GFX950-GISEL-NEXT: .LBB61_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB61_2 +; GFX950-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; 
GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3282,6 +6435,25 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_min_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_smin v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_min_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smin v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw min ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -3295,6 +6467,31 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_min_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 
0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_smin v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_min_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smin v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3309,6 +6506,25 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 
%voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw min ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -3321,6 +6537,31 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3420,6 +6661,92 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB66_5 ; GFX1250-GISEL-NEXT: .LBB66_5: +; +; GFX950-SDAG-LABEL: flat_min_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB66_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB66_4 +; GFX950-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB66_5 +; GFX950-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB66_2 +; GFX950-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB66_5 +; GFX950-SDAG-NEXT: .LBB66_5: +; +; GFX950-GISEL-LABEL: flat_min_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 
v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB66_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB66_4 +; GFX950-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB66_5 +; GFX950-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB66_2 +; GFX950-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB66_5 +; GFX950-GISEL-NEXT: .LBB66_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw min 
ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -3525,6 +6852,98 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB67_5 ; GFX1250-GISEL-NEXT: .LBB67_5: +; +; GFX950-SDAG-LABEL: flat_min_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB67_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB67_4 +; GFX950-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB67_5 +; GFX950-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB67_2 +; GFX950-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: 
v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB67_5 +; GFX950-SDAG-NEXT: .LBB67_5: +; +; GFX950-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB67_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB67_4 +; GFX950-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB67_5 +; GFX950-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB67_2 +; GFX950-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: 
v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB67_5 +; GFX950-GISEL-NEXT: .LBB67_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3610,6 +7029,80 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_min_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB68_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB68_4 +; GFX950-SDAG-NEXT: .LBB68_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB68_2 +; GFX950-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_min_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB68_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB68_4 +; GFX950-GISEL-NEXT: .LBB68_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB68_2 +; GFX950-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private +; 
GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw min ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -3699,6 +7192,86 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_min_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB69_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB69_4 +; GFX950-SDAG-NEXT: .LBB69_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB69_2 +; GFX950-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB69_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB69_4 +; GFX950-GISEL-NEXT: .LBB69_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB69_2 +; GFX950-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3716,6 +7289,25 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_umax v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umax v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = 
zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -3729,6 +7321,31 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_umax v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umax v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3743,6 +7360,25 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: 
flat_umax_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umax ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -3755,6 +7391,31 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3854,6 +7515,92 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB74_5 ; GFX1250-GISEL-NEXT: .LBB74_5: +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB74_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB74_4 +; GFX950-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB74_5 +; GFX950-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB74_2 +; GFX950-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, 
-1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB74_5 +; GFX950-SDAG-NEXT: .LBB74_5: +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB74_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB74_4 +; GFX950-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB74_5 +; GFX950-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB74_2 +; GFX950-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] 
+; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB74_5 +; GFX950-GISEL-NEXT: .LBB74_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -3959,6 +7706,98 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB75_5 ; GFX1250-GISEL-NEXT: .LBB75_5: +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB75_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB75_4 +; GFX950-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: 
s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB75_5 +; GFX950-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB75_2 +; GFX950-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB75_5 +; GFX950-SDAG-NEXT: .LBB75_5: +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB75_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; 
GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB75_4 +; GFX950-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB75_5 +; GFX950-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB75_2 +; GFX950-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB75_5 +; GFX950-GISEL-NEXT: .LBB75_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4044,6 +7883,80 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB76_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB76_4 +; GFX950-SDAG-NEXT: .LBB76_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB76_2 +; GFX950-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; 
GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB76_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB76_4 +; GFX950-GISEL-NEXT: .LBB76_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB76_2 +; GFX950-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umax ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -4133,6 +8046,86 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 
v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB77_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB77_4 +; GFX950-SDAG-NEXT: .LBB77_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB77_2 +; GFX950-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 
src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB77_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB77_4 +; GFX950-GISEL-NEXT: .LBB77_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB77_2 +; GFX950-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4150,6 +8143,25 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_umin v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umin v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -4163,6 +8175,31 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_umin v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, 
vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umin v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4177,6 +8214,25 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umin ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -4189,6 +8245,31 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4288,6 +8369,92 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB82_5 ; GFX1250-GISEL-NEXT: .LBB82_5: +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB82_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: 
s_cbranch_execnz .LBB82_4 +; GFX950-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB82_5 +; GFX950-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB82_2 +; GFX950-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB82_5 +; GFX950-SDAG-NEXT: .LBB82_5: +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB82_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; 
GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB82_4 +; GFX950-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB82_5 +; GFX950-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB82_2 +; GFX950-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB82_5 +; GFX950-GISEL-NEXT: .LBB82_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -4393,6 +8560,98 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB83_5 ; GFX1250-GISEL-NEXT: .LBB83_5: +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB83_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB83_4 +; GFX950-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB83_5 +; GFX950-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB83_2 +; GFX950-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB83_5 +; GFX950-SDAG-NEXT: .LBB83_5: +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: 
v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB83_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB83_4 +; GFX950-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB83_5 +; GFX950-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB83_2 +; GFX950-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: 
s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB83_5 +; GFX950-GISEL-NEXT: .LBB83_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4478,6 +8737,80 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB84_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB84_4 +; GFX950-SDAG-NEXT: .LBB84_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB84_2 +; GFX950-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] 
+; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB84_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB84_4 +; GFX950-GISEL-NEXT: .LBB84_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB84_2 +; GFX950-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; 
GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umin ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -4567,6 +8900,86 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB85_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB85_4 +; GFX950-SDAG-NEXT: .LBB85_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB85_2 +; GFX950-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) 
+; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB85_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB85_4 +; GFX950-GISEL-NEXT: .LBB85_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB85_2 +; GFX950-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], 
v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4589,6 +9002,30 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffse ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %cmpxchg = cmpxchg ptr %gep0, i32 %cmp, i32 %data seq_cst seq_cst @@ -4608,6 +9045,36 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; 
GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4627,6 +9094,30 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; 
GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = cmpxchg ptr %gep0, i32 %cmp, i32 %data seq_cst seq_cst @@ -4643,6 +9134,36 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, 
vcc, v4, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4748,6 +9269,98 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB90_5 ; GFX1250-GISEL-NEXT: .LBB90_5: +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB90_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB90_4 +; GFX950-SDAG-NEXT: .LBB90_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB90_5 +; GFX950-SDAG-NEXT: .LBB90_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], 
v[4:7] sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB90_2 +; GFX950-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v8, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v8, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB90_5 +; GFX950-SDAG-NEXT: .LBB90_5: +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB90_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB90_4 +; GFX950-GISEL-NEXT: .LBB90_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 
exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB90_5 +; GFX950-GISEL-NEXT: .LBB90_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB90_2 +; GFX950-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB90_5 +; GFX950-GISEL-NEXT: .LBB90_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %cmpxchg = cmpxchg ptr %gep0, i64 %cmp, i64 %data seq_cst seq_cst @@ -4860,6 +9473,104 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB91_5 ; GFX1250-GISEL-NEXT: .LBB91_5: +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB91_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB91_4 +; GFX950-SDAG-NEXT: .LBB91_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB91_5 +; GFX950-SDAG-NEXT: .LBB91_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB91_2 +; GFX950-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v8, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v8, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB91_5 +; GFX950-SDAG-NEXT: .LBB91_5: +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; 
GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB91_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB91_4 +; GFX950-GISEL-NEXT: .LBB91_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB91_5 +; GFX950-GISEL-NEXT: .LBB91_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB91_2 +; GFX950-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: 
v_cndmask_b32_e32 v2, v0, v6, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB91_5 +; GFX950-GISEL-NEXT: .LBB91_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4956,6 +9667,88 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB92_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB92_4 +; GFX950-SDAG-NEXT: .LBB92_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB92_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:7] sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; 
GFX950-SDAG-NEXT: s_cbranch_execz .LBB92_2 +; GFX950-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB92_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB92_4 +; GFX950-GISEL-NEXT: .LBB92_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB92_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:9] sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: 
s_cbranch_execz .LBB92_2 +; GFX950-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = cmpxchg ptr %gep0, i64 %cmp, i64 %data seq_cst seq_cst @@ -5055,6 +9848,94 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB93_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB93_4 +; GFX950-SDAG-NEXT: 
.LBB93_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB93_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:7] sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB93_2 +; GFX950-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB93_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; 
GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB93_4 +; GFX950-GISEL-NEXT: .LBB93_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB93_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:9] sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB93_2 +; GFX950-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5072,6 +9953,25 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_inc v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_inc v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw uinc_wrap ptr %gep0, i32 %data syncscope("agent") monotonic @@ -5085,6 +9985,31 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_inc v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_inc v0, v[2:3], v1 sc0 +; 
GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5098,6 +10023,23 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_inc v[0:1], v2 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_inc v[2:3], v1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw uinc_wrap ptr %gep0, i32 %data syncscope("agent") monotonic @@ -5109,6 +10051,29 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_inc v[0:1], v2 +; GFX950-SDAG-NEXT: s_endpgm 
+; +; GFX950-GISEL-LABEL: flat_inc_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_inc v[2:3], v1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5212,6 +10177,96 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB98_5 ; GFX1250-GISEL-NEXT: .LBB98_5: +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB98_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB98_4 +; GFX950-SDAG-NEXT: .LBB98_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB98_5 +; GFX950-SDAG-NEXT: .LBB98_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; 
implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB98_2 +; GFX950-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB98_5 +; GFX950-SDAG-NEXT: .LBB98_5: +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB98_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB98_4 +; GFX950-GISEL-NEXT: .LBB98_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB98_5 +; GFX950-GISEL-NEXT: .LBB98_3: ; %atomicrmw.global +; 
GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB98_2 +; GFX950-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB98_5 +; GFX950-GISEL-NEXT: .LBB98_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw uinc_wrap ptr %gep0, i64 %data syncscope("agent") monotonic @@ -5321,6 +10376,102 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB99_5 ; GFX1250-GISEL-NEXT: .LBB99_5: +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; 
GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB99_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB99_4 +; GFX950-SDAG-NEXT: .LBB99_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB99_5 +; GFX950-SDAG-NEXT: .LBB99_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB99_2 +; GFX950-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB99_5 +; GFX950-SDAG-NEXT: .LBB99_5: +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; 
GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB99_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB99_4 +; GFX950-GISEL-NEXT: .LBB99_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB99_5 +; GFX950-GISEL-NEXT: .LBB99_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB99_2 +; GFX950-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off 
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB99_5 +; GFX950-GISEL-NEXT: .LBB99_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5410,6 +10561,82 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB100_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB100_4 +; GFX950-SDAG-NEXT: .LBB100_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB100_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB100_2 +; GFX950-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; 
GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB100_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB100_4 +; GFX950-GISEL-NEXT: .LBB100_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB100_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB100_2 +; GFX950-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: 
v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw uinc_wrap ptr %gep0, i64 %data syncscope("agent") monotonic @@ -5503,6 +10730,88 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB101_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB101_4 +; GFX950-SDAG-NEXT: .LBB101_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB101_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB101_2 +; GFX950-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: 
v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB101_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB101_4 +; GFX950-GISEL-NEXT: .LBB101_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB101_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB101_2 +; GFX950-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; 
GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5521,6 +10830,25 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_dec v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_dec v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw udec_wrap ptr %gep0, i32 %data syncscope("agent") monotonic @@ -5534,6 +10862,31 @@ define 
amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_dec v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_dec v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5547,6 +10900,23 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; 
GFX950-SDAG-NEXT: flat_atomic_dec v[0:1], v2 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_dec v[2:3], v1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw udec_wrap ptr %gep0, i32 %data syncscope("agent") monotonic @@ -5558,6 +10928,29 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_dec v[0:1], v2 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_dec v[2:3], v1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5665,6 +11058,98 @@ 
define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-GISEL-NEXT: s_branch .LBB106_5 ; GFX1250-GISEL-NEXT: .LBB106_5: +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB106_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB106_4 +; GFX950-SDAG-NEXT: .LBB106_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB106_5 +; GFX950-SDAG-NEXT: .LBB106_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB106_2 +; GFX950-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] +; 
GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB106_5 +; GFX950-SDAG-NEXT: .LBB106_5: +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB106_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB106_4 +; GFX950-GISEL-NEXT: .LBB106_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB106_5 +; GFX950-GISEL-NEXT: .LBB106_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB106_2 +; GFX950-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt 
vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB106_5 +; GFX950-GISEL-NEXT: .LBB106_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw udec_wrap ptr %gep0, i64 %data syncscope("agent") monotonic @@ -5778,6 +11263,104 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-GISEL-NEXT: s_branch .LBB107_5 ; GFX1250-GISEL-NEXT: .LBB107_5: +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB107_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB107_4 +; GFX950-SDAG-NEXT: .LBB107_2: ; %atomicrmw.phi +; 
GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB107_5 +; GFX950-SDAG-NEXT: .LBB107_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB107_2 +; GFX950-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB107_5 +; GFX950-SDAG-NEXT: .LBB107_5: +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; 
implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB107_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB107_4 +; GFX950-GISEL-NEXT: .LBB107_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB107_5 +; GFX950-GISEL-NEXT: .LBB107_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB107_2 +; GFX950-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB107_5 +; GFX950-GISEL-NEXT: .LBB107_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ 
-5871,6 +11454,84 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB108_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB108_4 +; GFX950-SDAG-NEXT: .LBB108_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB108_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB108_2 +; GFX950-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX950-SDAG-NEXT: 
scratch_store_dwordx2 v6, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB108_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB108_4 +; GFX950-GISEL-NEXT: .LBB108_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB108_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB108_2 +; GFX950-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, -1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, 
v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw udec_wrap ptr %gep0, i64 %data syncscope("agent") monotonic @@ -5968,6 +11629,90 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB109_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB109_4 +; GFX950-SDAG-NEXT: .LBB109_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB109_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB109_2 +; GFX950-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: 
s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB109_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB109_4 +; GFX950-GISEL-NEXT: .LBB109_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB109_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB109_2 +; GFX950-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: 
scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, -1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5975,4 +11720,2004 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ret void } +define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fadd_f64_saddr_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; 
GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB110_5 +; GFX1250-SDAG-NEXT: s_branch .LBB110_6 +; GFX1250-SDAG-NEXT: .LBB110_3: +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_branch .LBB110_7 +; GFX1250-SDAG-NEXT: .LBB110_4: +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[4:5], s2 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: .LBB110_6: ; %Flow1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB110_8 +; GFX1250-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fadd_f64_saddr_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50 +; 
GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_6 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_3 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: .LBB110_3: ; %Flow +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_5 +; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[4:5], s2 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB110_5: ; %Flow1 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: .LBB110_6: ; %Flow2 +; GFX1250-GISEL-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_8 +; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_f64_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB110_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB110_4 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB110_5 +; GFX950-SDAG-NEXT: s_branch .LBB110_6 +; GFX950-SDAG-NEXT: .LBB110_3: +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_branch .LBB110_7 +; GFX950-SDAG-NEXT: .LBB110_4: +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: .LBB110_5: ; 
%atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[4:5], s2 +; GFX950-SDAG-NEXT: .LBB110_6: ; %Flow1 +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB110_8 +; GFX950-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_f64_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB110_6 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB110_3 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX950-GISEL-NEXT: .LBB110_3: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; 
GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB110_5 +; GFX950-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[4:5], s2 +; GFX950-GISEL-NEXT: .LBB110_5: ; %Flow1 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX950-GISEL-NEXT: .LBB110_6: ; %Flow2 +; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB110_8 +; GFX950-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fadd_f64_saddr_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX1250-SDAG-NEXT: 
s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow2 +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_8 +; GFX1250-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_5 +; GFX1250-SDAG-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0 +; GFX1250-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: .LBB111_5: ; %Flow +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 +; GFX1250-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_add_f64_e32 v[2:3], v[2:3], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[2:3], s2 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: .LBB111_7: ; %Flow1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB111_2 +; GFX1250-SDAG-NEXT: .LBB111_8: ; 
%atomicrmw.shared +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: ds_add_f64 v2, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fadd_f64_saddr_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_3 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: .LBB111_3: ; %Flow +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_5 +; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 +; 
GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_f64_e32 v[2:3], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[2:3], s2 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB111_5: ; %Flow1 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: .LBB111_6: ; %Flow2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_8 +; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: ds_add_f64 v2, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_f64_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 +; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow2 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB111_8 +; GFX950-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; 
GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB111_5 +; GFX950-SDAG-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: .LBB111_5: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 +; GFX950-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[2:3], s2 +; GFX950-SDAG-NEXT: .LBB111_7: ; %Flow1 +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB111_2 +; GFX950-SDAG-NEXT: .LBB111_8: ; %atomicrmw.shared +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: ds_add_f64 v2, v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_f64_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB111_3 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX950-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX950-GISEL-NEXT: .LBB111_3: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB111_5 +; GFX950-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[2:3], s2 +; GFX950-GISEL-NEXT: .LBB111_5: ; %Flow1 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX950-GISEL-NEXT: .LBB111_6: ; %Flow2 +; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB111_8 +; GFX950-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: ds_add_f64 v2, v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fmax_f64_saddr_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, 
s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB112_2 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB112_3 +; GFX1250-SDAG-NEXT: s_branch .LBB112_4 +; GFX1250-SDAG-NEXT: .LBB112_2: +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmax_f64_saddr_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB112_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB112_2: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB112_4 +; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_f64_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: 
s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB112_2 +; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB112_3 +; GFX950-SDAG-NEXT: s_branch .LBB112_4 +; GFX950-SDAG-NEXT: .LBB112_2: +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_f64_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 +; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB112_2 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] offset:80 sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: 
s_mov_b32 s4, 0 +; GFX950-GISEL-NEXT: .LBB112_2: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB112_4 +; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fmax_f64_saddr_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB113_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 
+; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB113_4 +; GFX1250-SDAG-NEXT: .LBB113_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2 +; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmax_f64_saddr_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 
+; GFX1250-GISEL-NEXT: .LBB113_2: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB113_4 +; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB113_4: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_f64_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 +; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB113_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB113_4 +; GFX950-SDAG-NEXT: .LBB113_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_atomic_max_f64 v[2:3], v[0:1] +; 
GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB113_2 +; GFX950-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_f64_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 +; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_max_f64 v[2:3], v[0:1] offset:80 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX950-GISEL-NEXT: .LBB113_2: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB113_4 +; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-GISEL-NEXT: .LBB113_4: ; %atomicrmw.phi +; 
GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fmin_f64_saddr_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB114_2 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB114_3 +; GFX1250-SDAG-NEXT: s_branch .LBB114_4 +; GFX1250-SDAG-NEXT: .LBB114_2: +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], 
v[2:3], v[2:3] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmin_f64_saddr_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB114_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB114_2: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB114_4 +; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_f64_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB114_2 +; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB114_3 +; GFX950-SDAG-NEXT: s_branch .LBB114_4 +; GFX950-SDAG-NEXT: .LBB114_2: +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 
v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_f64_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 +; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB114_2 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] offset:80 sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX950-GISEL-NEXT: .LBB114_2: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB114_4 +; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { +; 
GFX1250-SDAG-LABEL: flat_atomic_fmin_f64_saddr_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB115_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB115_4 +; GFX1250-SDAG-NEXT: .LBB115_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2 +; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: 
flat_atomic_fmin_f64_saddr_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB115_2: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB115_4 +; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB115_4: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_f64_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 +; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB115_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB115_4 +; GFX950-SDAG-NEXT: .LBB115_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_atomic_min_f64 v[2:3], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB115_2 +; GFX950-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_f64_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 +; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; 
GFX950-GISEL-NEXT: flat_atomic_min_f64 v[2:3], v[0:1] offset:80 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX950-GISEL-NEXT: .LBB115_2: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB115_4 +; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-GISEL-NEXT: .LBB115_4: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @flat_atomic_fadd_f32_saddr_rtn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fadd_f32_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_f32_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: flat_atomic_add_f32 v0, v[2:3], v0 offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: 
flat_atomic_fadd_f32_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_add_f32 v0, v[2:3], v0 offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @flat_atomic_fadd_f32_saddr_nortn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fadd_f32_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_add_f32 v1, v0, s[0:1] offset:40 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_f32_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: flat_atomic_add_f32 v[2:3], v0 offset:40 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_f32_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_add_f32 v[2:3], v0 offset:40 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) { +; 
GFX1250-LABEL: flat_atomic_fmax_f32_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-SDAG-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB118_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_f32_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-GISEL-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB118_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @flat_atomic_fmax_f32_saddr_nortn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fmax_f32_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_max_num_f32 v1, v0, s[0:1] offset:40 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB119_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_f32_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-GISEL-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB119_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fmin_f32_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-SDAG-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX950-SDAG-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB120_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_f32_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-GISEL-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v5, v5 +; 
GFX950-GISEL-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB120_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @flat_atomic_fmin_f32_saddr_nortn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fmin_f32_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_min_num_f32 v1, v0, s[0:1] offset:40 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], 
vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB121_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_f32_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-GISEL-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB121_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fadd_v2f16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_pk_add_f16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN +; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[2:3], v0 offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_v2f16_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[2:3], v0 offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void @flat_atomic_fadd_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_pk_add_f16 v1, v0, s[0:1] offset:40 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: flat_atomic_pk_add_f16 v[2:3], v0 offset:40 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_pk_add_f16 v[2:3], v0 offset:40 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v5, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX1250-NEXT: v_pk_max_num_f16 v4, v0, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB124_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-SDAG-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB124_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: 
s_cbranch_execnz .LBB124_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: v_pk_max_num_f16 v3, v0, v0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v3 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB125_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 
v4, v0, v0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB125_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-GISEL-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB125_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 
= getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v5, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX1250-NEXT: v_pk_min_num_f16 v4, v0, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB126_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-SDAG-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_min_f16 v4, v0, v1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB126_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v4, v0, v1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB126_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw 
fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: v_pk_max_num_f16 v3, v0, v0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_min_num_f16 v0, v0, v3 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB127_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 +; 
GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB127_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-GISEL-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB127_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x bfloat> 
@flat_atomic_fadd_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fadd_v2bf16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_pk_add_bf16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_rtn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[2:3], v0 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define void @flat_atomic_fadd_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_pk_add_bf16 v1, v0, s[0:1] offset:40 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-NEXT: flat_atomic_pk_add_bf16 v[2:3], v0 offset:40 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, 
!amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x bfloat> @flat_atomic_fmax_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fmax_v2bf16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v5, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_bf16 v4, v5, v0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB130_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_rtn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v7, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX950-NEXT: 
v_cvt_pk_bf16_f32 v6, v5, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB130_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define void @flat_atomic_fmax_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fmax_v2bf16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v3, v1, s[0:1] offset:40 +; GFX1250-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v2, v3, v0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB131_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_nortn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] 
+; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB131_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x bfloat> @flat_atomic_fmin_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fmin_v2bf16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v5, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_min_num_bf16 v4, v5, v0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB132_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_rtn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v7, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB132_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define void @flat_atomic_fmin_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: 
flat_atomic_fmin_v2bf16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v3, v1, s[0:1] offset:40 +; GFX1250-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v2, v3, v0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB133_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_nortn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; 
GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB133_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + attributes #0 = { argmemonly nounwind willreturn } + +!0 = !{}