Skip to content

Commit e16a713

Browse files
committed
AMDGPU: Select global atomicrmw fadd
This only works if the atomicrmw's return value is unused; when the result is used, the operation is still expanded to a cmpxchg loop.
1 parent 9f9f42d commit e16a713

File tree

7 files changed

+195
-13
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4343,7 +4343,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
43434343
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
43444344
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
43454345
NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
4346-
NODE_NAME_CASE(ATOMIC_FADD)
43474346
NODE_NAME_CASE(ATOMIC_PK_FADD)
43484347

43494348
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -531,7 +531,6 @@ enum NodeType : unsigned {
531531
BUFFER_ATOMIC_CMPSWAP,
532532
BUFFER_ATOMIC_FADD,
533533
BUFFER_ATOMIC_PK_FADD,
534-
ATOMIC_FADD,
535534
ATOMIC_PK_FADD,
536535

537536
LAST_AMDGPU_ISD_NUMBER

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -364,11 +364,12 @@ multiclass FLAT_Global_Atomic_Pseudo<
364364
string opName,
365365
RegisterClass vdst_rc,
366366
ValueType vt,
367-
SDPatternOperator atomic = null_frag,
367+
SDPatternOperator atomic_rtn = null_frag,
368+
SDPatternOperator atomic_no_rtn = null_frag,
368369
ValueType data_vt = vt,
369370
RegisterClass data_rc = vdst_rc> :
370-
FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>,
371-
FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic, data_vt, data_rc>;
371+
FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic_no_rtn, data_vt, data_rc>,
372+
FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic_rtn, data_vt, data_rc>;
372373

373374

374375
//===----------------------------------------------------------------------===//
@@ -535,11 +536,12 @@ defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d
535536

536537
let is_flat_global = 1 in {
537538
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap",
538-
VGPR_32, i32, AMDGPUatomic_cmp_swap_global_32,
539+
VGPR_32, i32, AMDGPUatomic_cmp_swap_global_32, null_frag,
539540
v2i32, VReg_64>;
540541

541542
defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswap_x2",
542543
VReg_64, i64, AMDGPUatomic_cmp_swap_global_64,
544+
null_frag,
543545
v2i64, VReg_128>;
544546

545547
defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap",

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7094,13 +7094,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
70947094
EVT VT = Op.getOperand(3).getValueType();
70957095

70967096
auto *M = cast<MemSDNode>(Op);
7097-
unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD
7098-
: AMDGPUISD::ATOMIC_FADD;
7097+
if (VT.isVector()) {
7098+
return DAG.getMemIntrinsicNode(
7099+
AMDGPUISD::ATOMIC_PK_FADD, DL, Op->getVTList(), Ops, VT,
7100+
M->getMemOperand());
7101+
}
70997102

7100-
return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
7101-
M->getMemOperand());
7103+
return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
7104+
DAG.getVTList(VT, MVT::Other), Ops,
7105+
M->getMemOperand()).getValue(1);
71027106
}
7103-
71047107
case Intrinsic::amdgcn_end_cf:
71057108
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
71067109
Op->getOperand(2), Chain), 0);
@@ -10936,6 +10939,12 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1093610939

1093710940
// TODO: Do have these for flat. Older targets also had them for buffers.
1093810941
unsigned AS = RMW->getPointerAddressSpace();
10942+
10943+
if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
10944+
return RMW->use_empty() ? AtomicExpansionKind::None :
10945+
AtomicExpansionKind::CmpXChg;
10946+
}
10947+
1093910948
return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
1094010949
AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
1094110950
}

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,6 @@ class SDGlobalAtomicNoRtn<string opcode, ValueType ty> : SDNode <opcode,
225225
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
226226
>;
227227

228-
def SIglobal_atomic_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_FADD", f32>;
229228
def SIglobal_atomic_pk_fadd : SDGlobalAtomicNoRtn <"AMDGPUISD::ATOMIC_PK_FADD", v2f16>;
230229

231230
def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
@@ -324,7 +323,7 @@ defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>;
324323

325324
def atomic_fadd_global_noret : PatFrag<
326325
(ops node:$ptr, node:$value),
327-
(SIglobal_atomic_fadd node:$ptr, node:$value)> {
326+
(atomic_load_fadd node:$ptr, node:$value)> {
328327
// FIXME: Move this
329328
let MemoryVT = f32;
330329
let IsAtomic = 1;
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
2+
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
3+
4+
; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32:
5+
; GCN: [[LOOP:BB[0-9]+_[0-9]+]]
6+
; GCN: v_add_f32_e32
7+
; GCN: global_atomic_cmpswap
8+
; GCN: s_andn2_b64 exec, exec,
9+
; GCN-NEXT: s_cbranch_execnz [[LOOP]]
10+
define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) {
11+
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
12+
store float %result, float addrspace(1)* undef
13+
ret void
14+
}
15+
16+
; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32:
17+
; GFX900: [[LOOP:BB[0-9]+_[0-9]+]]
18+
; GFX900: v_add_f32_e32
19+
; GFX900: global_atomic_cmpswap
20+
; GFX900: s_andn2_b64 exec, exec,
21+
; GFX900-NEXT: s_cbranch_execnz [[LOOP]]
22+
23+
; GFX908-NOT: v_add_f32
24+
; GFX908: global_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off
25+
; GFX908-NOT: s_cbranch_execnz
26+
define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) {
27+
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
28+
ret void
29+
}

llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
22
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -atomic-expand %s | FileCheck -check-prefix=CI %s
33
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -atomic-expand %s | FileCheck -check-prefix=GFX9 %s
4+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -atomic-expand %s | FileCheck -check-prefix=GFX908 %s
45

56
define float @test_atomicrmw_fadd_f32_flat(float* %ptr, float %value) {
67
; CI-LABEL: @test_atomicrmw_fadd_f32_flat(
@@ -36,6 +37,23 @@ define float @test_atomicrmw_fadd_f32_flat(float* %ptr, float %value) {
3637
; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
3738
; GFX9: atomicrmw.end:
3839
; GFX9-NEXT: ret float [[TMP6]]
40+
;
41+
; GFX908-LABEL: @test_atomicrmw_fadd_f32_flat(
42+
; GFX908-NEXT: [[TMP1:%.*]] = load float, float* [[PTR:%.*]], align 4
43+
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
44+
; GFX908: atomicrmw.start:
45+
; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
46+
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
47+
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float* [[PTR]] to i32*
48+
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
49+
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
50+
; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
51+
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
52+
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
53+
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
54+
; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
55+
; GFX908: atomicrmw.end:
56+
; GFX908-NEXT: ret float [[TMP6]]
3957
;
4058
%res = atomicrmw fadd float* %ptr, float %value seq_cst
4159
ret float %res
@@ -75,11 +93,71 @@ define float @test_atomicrmw_fadd_f32_global(float addrspace(1)* %ptr, float %va
7593
; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
7694
; GFX9: atomicrmw.end:
7795
; GFX9-NEXT: ret float [[TMP6]]
96+
;
97+
; GFX908-LABEL: @test_atomicrmw_fadd_f32_global(
98+
; GFX908-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4
99+
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
100+
; GFX908: atomicrmw.start:
101+
; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
102+
; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
103+
; GFX908-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)*
104+
; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
105+
; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
106+
; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
107+
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
108+
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
109+
; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
110+
; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
111+
; GFX908: atomicrmw.end:
112+
; GFX908-NEXT: ret float [[TMP6]]
78113
;
79114
%res = atomicrmw fadd float addrspace(1)* %ptr, float %value seq_cst
80115
ret float %res
81116
}
82117

118+
define void @test_atomicrmw_fadd_f32_global_no_use(float addrspace(1)* %ptr, float %value) {
119+
; CI-LABEL: @test_atomicrmw_fadd_f32_global_no_use(
120+
; CI-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4
121+
; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
122+
; CI: atomicrmw.start:
123+
; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
124+
; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
125+
; CI-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)*
126+
; CI-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
127+
; CI-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
128+
; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
129+
; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
130+
; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
131+
; CI-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
132+
; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
133+
; CI: atomicrmw.end:
134+
; CI-NEXT: ret void
135+
;
136+
; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_no_use(
137+
; GFX9-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4
138+
; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
139+
; GFX9: atomicrmw.start:
140+
; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
141+
; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
142+
; GFX9-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)*
143+
; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
144+
; GFX9-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
145+
; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
146+
; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
147+
; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
148+
; GFX9-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
149+
; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
150+
; GFX9: atomicrmw.end:
151+
; GFX9-NEXT: ret void
152+
;
153+
; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use(
154+
; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst
155+
; GFX908-NEXT: ret void
156+
;
157+
%res = atomicrmw fadd float addrspace(1)* %ptr, float %value seq_cst
158+
ret void
159+
}
160+
83161
define float @test_atomicrmw_fadd_f32_local(float addrspace(3)* %ptr, float %value) {
84162
; CI-LABEL: @test_atomicrmw_fadd_f32_local(
85163
; CI-NEXT: [[TMP1:%.*]] = load float, float addrspace(3)* [[PTR:%.*]], align 4
@@ -101,6 +179,10 @@ define float @test_atomicrmw_fadd_f32_local(float addrspace(3)* %ptr, float %val
101179
; GFX9-LABEL: @test_atomicrmw_fadd_f32_local(
102180
; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(3)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst
103181
; GFX9-NEXT: ret float [[RES]]
182+
;
183+
; GFX908-LABEL: @test_atomicrmw_fadd_f32_local(
184+
; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(3)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst
185+
; GFX908-NEXT: ret float [[RES]]
104186
;
105187
%res = atomicrmw fadd float addrspace(3)* %ptr, float %value seq_cst
106188
ret float %res
@@ -114,6 +196,10 @@ define half @test_atomicrmw_fadd_f16_flat(half* %ptr, half %value) {
114196
; GFX9-LABEL: @test_atomicrmw_fadd_f16_flat(
115197
; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst
116198
; GFX9-NEXT: ret half [[RES]]
199+
;
200+
; GFX908-LABEL: @test_atomicrmw_fadd_f16_flat(
201+
; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst
202+
; GFX908-NEXT: ret half [[RES]]
117203
;
118204
%res = atomicrmw fadd half* %ptr, half %value seq_cst
119205
ret half %res
@@ -127,6 +213,10 @@ define half @test_atomicrmw_fadd_f16_global(half addrspace(1)* %ptr, half %value
127213
; GFX9-LABEL: @test_atomicrmw_fadd_f16_global(
128214
; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst
129215
; GFX9-NEXT: ret half [[RES]]
216+
;
217+
; GFX908-LABEL: @test_atomicrmw_fadd_f16_global(
218+
; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(1)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst
219+
; GFX908-NEXT: ret half [[RES]]
130220
;
131221
%res = atomicrmw fadd half addrspace(1)* %ptr, half %value seq_cst
132222
ret half %res
@@ -140,6 +230,10 @@ define half @test_atomicrmw_fadd_f16_local(half addrspace(3)* %ptr, half %value)
140230
; GFX9-LABEL: @test_atomicrmw_fadd_f16_local(
141231
; GFX9-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst
142232
; GFX9-NEXT: ret half [[RES]]
233+
;
234+
; GFX908-LABEL: @test_atomicrmw_fadd_f16_local(
235+
; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd half addrspace(3)* [[PTR:%.*]], half [[VALUE:%.*]] seq_cst
236+
; GFX908-NEXT: ret half [[RES]]
143237
;
144238
%res = atomicrmw fadd half addrspace(3)* %ptr, half %value seq_cst
145239
ret half %res
@@ -179,6 +273,23 @@ define double @test_atomicrmw_fadd_f64_flat(double* %ptr, double %value) {
179273
; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
180274
; GFX9: atomicrmw.end:
181275
; GFX9-NEXT: ret double [[TMP6]]
276+
;
277+
; GFX908-LABEL: @test_atomicrmw_fadd_f64_flat(
278+
; GFX908-NEXT: [[TMP1:%.*]] = load double, double* [[PTR:%.*]], align 8
279+
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
280+
; GFX908: atomicrmw.start:
281+
; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
282+
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
283+
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double* [[PTR]] to i64*
284+
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64
285+
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
286+
; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst
287+
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
288+
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
289+
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
290+
; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
291+
; GFX908: atomicrmw.end:
292+
; GFX908-NEXT: ret double [[TMP6]]
182293
;
183294
%res = atomicrmw fadd double* %ptr, double %value seq_cst
184295
ret double %res
@@ -218,6 +329,23 @@ define double @test_atomicrmw_fadd_f64_global(double addrspace(1)* %ptr, double
218329
; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
219330
; GFX9: atomicrmw.end:
220331
; GFX9-NEXT: ret double [[TMP6]]
332+
;
333+
; GFX908-LABEL: @test_atomicrmw_fadd_f64_global(
334+
; GFX908-NEXT: [[TMP1:%.*]] = load double, double addrspace(1)* [[PTR:%.*]], align 8
335+
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
336+
; GFX908: atomicrmw.start:
337+
; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
338+
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
339+
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)*
340+
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64
341+
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
342+
; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst
343+
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
344+
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
345+
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
346+
; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
347+
; GFX908: atomicrmw.end:
348+
; GFX908-NEXT: ret double [[TMP6]]
221349
;
222350
%res = atomicrmw fadd double addrspace(1)* %ptr, double %value seq_cst
223351
ret double %res
@@ -257,6 +385,23 @@ define double @test_atomicrmw_fadd_f64_local(double addrspace(3)* %ptr, double %
257385
; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
258386
; GFX9: atomicrmw.end:
259387
; GFX9-NEXT: ret double [[TMP6]]
388+
;
389+
; GFX908-LABEL: @test_atomicrmw_fadd_f64_local(
390+
; GFX908-NEXT: [[TMP1:%.*]] = load double, double addrspace(3)* [[PTR:%.*]], align 8
391+
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
392+
; GFX908: atomicrmw.start:
393+
; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
394+
; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
395+
; GFX908-NEXT: [[TMP2:%.*]] = bitcast double addrspace(3)* [[PTR]] to i64 addrspace(3)*
396+
; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64
397+
; GFX908-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
398+
; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(3)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst
399+
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
400+
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
401+
; GFX908-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
402+
; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
403+
; GFX908: atomicrmw.end:
404+
; GFX908-NEXT: ret double [[TMP6]]
260405
;
261406
%res = atomicrmw fadd double addrspace(3)* %ptr, double %value seq_cst
262407
ret double %res

0 commit comments

Comments
 (0)