Skip to content

Commit 7a94d4f

Browse files
committed
Allow combining of extract_subvector into a single-element extract (EXTRACT_VECTOR_ELT)
Differential Revision: https://reviews.llvm.org/D73132
1 parent c226646 commit 7a94d4f

File tree

7 files changed: +95 -102 lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18581,6 +18581,13 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
1858118581
V.getOperand(0), NewIndex);
1858218582
return DAG.getBitcast(NVT, NewExtract);
1858318583
}
18584+
if (NewExtNumElts == 1 &&
18585+
TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) {
18586+
SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
18587+
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
18588+
V.getOperand(0), NewIndex);
18589+
return DAG.getBitcast(NVT, NewExtract);
18590+
}
1858418591
}
1858518592
}
1858618593
}

llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

Lines changed: 55 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@ define <4 x half> @shuffle_v4f16_23uu(<4 x half> addrspace(1)* %arg0, <4 x half>
55
; GFX9-LABEL: shuffle_v4f16_23uu:
66
; GFX9: ; %bb.0:
77
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
8+
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
99
; GFX9-NEXT: s_waitcnt vmcnt(0)
10-
; GFX9-NEXT: v_mov_b32_e32 v0, v1
1110
; GFX9-NEXT: s_setpc_b64 s[30:31]
1211
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
1312
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -19,10 +18,10 @@ define <4 x half> @shuffle_v4f16_234u(<4 x half> addrspace(1)* %arg0, <4 x half>
1918
; GFX9-LABEL: shuffle_v4f16_234u:
2019
; GFX9: ; %bb.0:
2120
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22-
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
23-
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
21+
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
22+
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
2423
; GFX9-NEXT: s_waitcnt vmcnt(1)
25-
; GFX9-NEXT: v_mov_b32_e32 v0, v5
24+
; GFX9-NEXT: v_mov_b32_e32 v1, v2
2625
; GFX9-NEXT: s_waitcnt vmcnt(0)
2726
; GFX9-NEXT: s_setpc_b64 s[30:31]
2827
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@@ -154,7 +153,7 @@ define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half>
154153
; GFX9-LABEL: shuffle_v4f16_0101:
155154
; GFX9: ; %bb.0:
156155
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
156+
; GFX9-NEXT: global_load_dword v0, v[0:1], off
158157
; GFX9-NEXT: s_waitcnt vmcnt(0)
159158
; GFX9-NEXT: v_mov_b32_e32 v1, v0
160159
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -181,9 +180,8 @@ define <4 x half> @shuffle_v4f16_0145(<4 x half> addrspace(1)* %arg0, <4 x half>
181180
; GFX9-LABEL: shuffle_v4f16_0145:
182181
; GFX9: ; %bb.0:
183182
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
184-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
185-
; GFX9-NEXT: s_waitcnt vmcnt(0)
186-
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
183+
; GFX9-NEXT: global_load_dword v0, v[0:1], off
184+
; GFX9-NEXT: global_load_dword v1, v[2:3], off
187185
; GFX9-NEXT: s_waitcnt vmcnt(0)
188186
; GFX9-NEXT: s_setpc_b64 s[30:31]
189187
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@@ -196,11 +194,9 @@ define <4 x half> @shuffle_v4f16_0167(<4 x half> addrspace(1)* %arg0, <4 x half>
196194
; GFX9-LABEL: shuffle_v4f16_0167:
197195
; GFX9: ; %bb.0:
198196
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
199-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
200-
; GFX9-NEXT: s_waitcnt vmcnt(0)
201-
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
197+
; GFX9-NEXT: global_load_dword v0, v[0:1], off
198+
; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4
202199
; GFX9-NEXT: s_waitcnt vmcnt(0)
203-
; GFX9-NEXT: v_mov_b32_e32 v1, v2
204200
; GFX9-NEXT: s_setpc_b64 s[30:31]
205201
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
206202
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -226,9 +222,9 @@ define <4 x half> @shuffle_v4f16_2323(<4 x half> addrspace(1)* %arg0, <4 x half>
226222
; GFX9-LABEL: shuffle_v4f16_2323:
227223
; GFX9: ; %bb.0:
228224
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
225+
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
230226
; GFX9-NEXT: s_waitcnt vmcnt(0)
231-
; GFX9-NEXT: v_mov_b32_e32 v0, v1
227+
; GFX9-NEXT: v_mov_b32_e32 v1, v0
232228
; GFX9-NEXT: s_setpc_b64 s[30:31]
233229
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
234230
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -240,10 +236,8 @@ define <4 x half> @shuffle_v4f16_2345(<4 x half> addrspace(1)* %arg0, <4 x half>
240236
; GFX9-LABEL: shuffle_v4f16_2345:
241237
; GFX9: ; %bb.0:
242238
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243-
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
244-
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
245-
; GFX9-NEXT: s_waitcnt vmcnt(1)
246-
; GFX9-NEXT: v_mov_b32_e32 v0, v5
239+
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
240+
; GFX9-NEXT: global_load_dword v1, v[2:3], off
247241
; GFX9-NEXT: s_waitcnt vmcnt(0)
248242
; GFX9-NEXT: s_setpc_b64 s[30:31]
249243
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@@ -256,10 +250,9 @@ define <4 x half> @shuffle_v4f16_2367(<4 x half> addrspace(1)* %arg0, <4 x half>
256250
; GFX9-LABEL: shuffle_v4f16_2367:
257251
; GFX9: ; %bb.0:
258252
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
259-
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
260-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
253+
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
254+
; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4
261255
; GFX9-NEXT: s_waitcnt vmcnt(0)
262-
; GFX9-NEXT: v_mov_b32_e32 v0, v5
263256
; GFX9-NEXT: s_setpc_b64 s[30:31]
264257
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
265258
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -271,10 +264,11 @@ define <4 x half> @shuffle_v4f16_4501(<4 x half> addrspace(1)* %arg0, <4 x half>
271264
; GFX9-LABEL: shuffle_v4f16_4501:
272265
; GFX9: ; %bb.0:
273266
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
274-
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
275-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
267+
; GFX9-NEXT: global_load_dword v2, v[2:3], off
268+
; GFX9-NEXT: global_load_dword v1, v[0:1], off
269+
; GFX9-NEXT: s_waitcnt vmcnt(1)
270+
; GFX9-NEXT: v_mov_b32_e32 v0, v2
276271
; GFX9-NEXT: s_waitcnt vmcnt(0)
277-
; GFX9-NEXT: v_mov_b32_e32 v1, v4
278272
; GFX9-NEXT: s_setpc_b64 s[30:31]
279273
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
280274
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -286,10 +280,11 @@ define <4 x half> @shuffle_v4f16_4523(<4 x half> addrspace(1)* %arg0, <4 x half>
286280
; GFX9-LABEL: shuffle_v4f16_4523:
287281
; GFX9: ; %bb.0:
288282
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
290-
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
291-
; GFX9-NEXT: s_waitcnt vmcnt(0)
283+
; GFX9-NEXT: global_load_dword v2, v[2:3], off
284+
; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
285+
; GFX9-NEXT: s_waitcnt vmcnt(1)
292286
; GFX9-NEXT: v_mov_b32_e32 v0, v2
287+
; GFX9-NEXT: s_waitcnt vmcnt(0)
293288
; GFX9-NEXT: s_setpc_b64 s[30:31]
294289
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
295290
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -301,7 +296,7 @@ define <4 x half> @shuffle_v4f16_4545(<4 x half> addrspace(1)* %arg0, <4 x half>
301296
; GFX9-LABEL: shuffle_v4f16_4545:
302297
; GFX9: ; %bb.0:
303298
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
299+
; GFX9-NEXT: global_load_dword v0, v[2:3], off
305300
; GFX9-NEXT: s_waitcnt vmcnt(0)
306301
; GFX9-NEXT: v_mov_b32_e32 v1, v0
307302
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -328,11 +323,11 @@ define <4 x half> @shuffle_v4f16_6701(<4 x half> addrspace(1)* %arg0, <4 x half>
328323
; GFX9-LABEL: shuffle_v4f16_6701:
329324
; GFX9: ; %bb.0:
330325
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
331-
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
332-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
326+
; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
327+
; GFX9-NEXT: global_load_dword v1, v[0:1], off
328+
; GFX9-NEXT: s_waitcnt vmcnt(1)
329+
; GFX9-NEXT: v_mov_b32_e32 v0, v2
333330
; GFX9-NEXT: s_waitcnt vmcnt(0)
334-
; GFX9-NEXT: v_mov_b32_e32 v0, v1
335-
; GFX9-NEXT: v_mov_b32_e32 v1, v4
336331
; GFX9-NEXT: s_setpc_b64 s[30:31]
337332
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
338333
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -344,10 +339,11 @@ define <4 x half> @shuffle_v4f16_6723(<4 x half> addrspace(1)* %arg0, <4 x half>
344339
; GFX9-LABEL: shuffle_v4f16_6723:
345340
; GFX9: ; %bb.0:
346341
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
347-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
348-
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
342+
; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4
343+
; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
344+
; GFX9-NEXT: s_waitcnt vmcnt(1)
345+
; GFX9-NEXT: v_mov_b32_e32 v0, v2
349346
; GFX9-NEXT: s_waitcnt vmcnt(0)
350-
; GFX9-NEXT: v_mov_b32_e32 v0, v3
351347
; GFX9-NEXT: s_setpc_b64 s[30:31]
352348
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
353349
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -373,9 +369,9 @@ define <4 x half> @shuffle_v4f16_6767(<4 x half> addrspace(1)* %arg0, <4 x half>
373369
; GFX9-LABEL: shuffle_v4f16_6767:
374370
; GFX9: ; %bb.0:
375371
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off
372+
; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4
377373
; GFX9-NEXT: s_waitcnt vmcnt(0)
378-
; GFX9-NEXT: v_mov_b32_e32 v0, v1
374+
; GFX9-NEXT: v_mov_b32_e32 v1, v0
379375
; GFX9-NEXT: s_setpc_b64 s[30:31]
380376
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
381377
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -388,13 +384,12 @@ define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half>
388384
; GFX9: ; %bb.0:
389385
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
390386
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
391-
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
392-
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
387+
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
388+
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
393389
; GFX9-NEXT: s_waitcnt vmcnt(1)
394-
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
395-
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v0
390+
; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
391+
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
396392
; GFX9-NEXT: s_waitcnt vmcnt(0)
397-
; GFX9-NEXT: v_mov_b32_e32 v0, v5
398393
; GFX9-NEXT: s_setpc_b64 s[30:31]
399394
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
400395
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -407,11 +402,12 @@ define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half>
407402
; GFX9: ; %bb.0:
408403
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
409404
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
410-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
411-
; GFX9-NEXT: s_waitcnt vmcnt(0)
405+
; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4
412406
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
407+
; GFX9-NEXT: s_waitcnt vmcnt(1)
413408
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
414409
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
410+
; GFX9-NEXT: s_waitcnt vmcnt(0)
415411
; GFX9-NEXT: s_setpc_b64 s[30:31]
416412
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
417413
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -485,13 +481,12 @@ define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad
485481
; GFX9: ; %bb.0:
486482
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
487483
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
488-
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
489-
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
484+
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
485+
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
490486
; GFX9-NEXT: s_waitcnt vmcnt(1)
491-
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
492-
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v0
487+
; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
488+
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
493489
; GFX9-NEXT: s_waitcnt vmcnt(0)
494-
; GFX9-NEXT: v_mov_b32_e32 v0, v5
495490
; GFX9-NEXT: s_setpc_b64 s[30:31]
496491
%val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
497492
%val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
@@ -503,11 +498,9 @@ define <4 x i16> @shuffle_v4i16_0167(<4 x i16> addrspace(1)* %arg0, <4 x i16> ad
503498
; GFX9-LABEL: shuffle_v4i16_0167:
504499
; GFX9: ; %bb.0:
505500
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
506-
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
507-
; GFX9-NEXT: s_waitcnt vmcnt(0)
508-
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off
501+
; GFX9-NEXT: global_load_dword v0, v[0:1], off
502+
; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4
509503
; GFX9-NEXT: s_waitcnt vmcnt(0)
510-
; GFX9-NEXT: v_mov_b32_e32 v1, v2
511504
; GFX9-NEXT: s_setpc_b64 s[30:31]
512505
%val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0
513506
%val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1
@@ -590,12 +583,11 @@ define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half>
590583
; GFX9-LABEL: shuffle_v4f16_2333:
591584
; GFX9: ; %bb.0:
592585
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
593-
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
586+
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
594587
; GFX9-NEXT: s_waitcnt vmcnt(0)
595-
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
596-
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
597-
; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
598-
; GFX9-NEXT: v_mov_b32_e32 v0, v2
588+
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
589+
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
590+
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
599591
; GFX9-NEXT: s_setpc_b64 s[30:31]
600592
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
601593
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@@ -607,12 +599,11 @@ define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half>
607599
; GFX9-LABEL: shuffle_v4f16_6667:
608600
; GFX9: ; %bb.0:
609601
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610-
; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off
602+
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
611603
; GFX9-NEXT: s_waitcnt vmcnt(0)
612-
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v2
613-
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
614-
; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
615-
; GFX9-NEXT: v_mov_b32_e32 v0, v2
604+
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
605+
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
606+
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
616607
; GFX9-NEXT: s_setpc_b64 s[30:31]
617608
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
618609
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1

llvm/test/CodeGen/ARM/vdup.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -429,8 +429,8 @@ define <4 x i32> @tduplane(<4 x i32> %invec) {
429429
define <2 x float> @check_f32(<4 x float> %v) nounwind {
430430
; CHECK-LABEL: check_f32:
431431
; CHECK: @ %bb.0:
432-
; CHECK-NEXT: vmov d17, r2, r3
433-
; CHECK-NEXT: vdup.32 d16, d17[1]
432+
; CHECK-NEXT: vmov d16, r2, r3
433+
; CHECK-NEXT: vdup.32 d16, d16[1]
434434
; CHECK-NEXT: vmov r0, r1, d16
435435
; CHECK-NEXT: mov pc, lr
436436
%x = extractelement <4 x float> %v, i32 3
@@ -442,8 +442,8 @@ define <2 x float> @check_f32(<4 x float> %v) nounwind {
442442
define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
443443
; CHECK-LABEL: check_i32:
444444
; CHECK: @ %bb.0:
445-
; CHECK-NEXT: vmov d17, r2, r3
446-
; CHECK-NEXT: vdup.32 d16, d17[1]
445+
; CHECK-NEXT: vmov d16, r2, r3
446+
; CHECK-NEXT: vdup.32 d16, d16[1]
447447
; CHECK-NEXT: vmov r0, r1, d16
448448
; CHECK-NEXT: mov pc, lr
449449
%x = extractelement <4 x i32> %v, i32 3

llvm/test/CodeGen/ARM/vext.ll

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,10 @@ define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
183183
; CHECK: @ %bb.0:
184184
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
185185
; CHECK-NEXT: vext.16 d16, d16, d17, #3
186-
; CHECK-NEXT: vorr d17, d16, d16
187-
; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
188-
; CHECK-NEXT: vuzp.16 d16, d17
189-
; CHECK-NEXT: vzip.16 d16, d18
186+
; CHECK-NEXT: vorr d18, d16, d16
187+
; CHECK-NEXT: vldr d17, [r1]
188+
; CHECK-NEXT: vuzp.16 d16, d18
189+
; CHECK-NEXT: vzip.16 d16, d17
190190
; CHECK-NEXT: vmov r0, r1, d16
191191
; CHECK-NEXT: mov pc, lr
192192
%tmp1 = load <8 x i16>, <8 x i16>* %A
@@ -216,17 +216,15 @@ define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
216216
define <4 x i16> @test_multisource(<32 x i16>* %B) nounwind {
217217
; CHECK-LABEL: test_multisource:
218218
; CHECK: @ %bb.0:
219+
; CHECK-NEXT: vldr d18, [r0, #32]
219220
; CHECK-NEXT: mov r1, r0
220-
; CHECK-NEXT: add r2, r0, #48
221-
; CHECK-NEXT: add r0, r0, #32
221+
; CHECK-NEXT: vorr d22, d18, d18
222222
; CHECK-NEXT: vld1.16 {d16, d17}, [r1:128]!
223-
; CHECK-NEXT: vld1.64 {d20, d21}, [r0:128]
224-
; CHECK-NEXT: vorr d24, d20, d20
225-
; CHECK-NEXT: vld1.64 {d18, d19}, [r2:128]
226-
; CHECK-NEXT: vld1.64 {d22, d23}, [r1:128]
227-
; CHECK-NEXT: vzip.16 d24, d18
228-
; CHECK-NEXT: vtrn.16 q8, q11
229-
; CHECK-NEXT: vext.16 d18, d20, d24, #2
223+
; CHECK-NEXT: vldr d19, [r0, #48]
224+
; CHECK-NEXT: vld1.64 {d20, d21}, [r1:128]
225+
; CHECK-NEXT: vzip.16 d22, d19
226+
; CHECK-NEXT: vtrn.16 q8, q10
227+
; CHECK-NEXT: vext.16 d18, d18, d22, #2
230228
; CHECK-NEXT: vext.16 d16, d18, d16, #2
231229
; CHECK-NEXT: vext.16 d16, d16, d16, #2
232230
; CHECK-NEXT: vmov r0, r1, d16

llvm/test/CodeGen/ARM/vpadd.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -285,11 +285,11 @@ define void @addCombineToVPADDLq_s8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ss
285285
define void @addCombineToVPADDL_s8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
286286
; CHECK-LABEL: addCombineToVPADDL_s8:
287287
; CHECK: @ %bb.0:
288-
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
289-
; CHECK-NEXT: vext.8 d18, d16, d16, #1
288+
; CHECK-NEXT: vldr d16, [r0]
289+
; CHECK-NEXT: vext.8 d17, d16, d16, #1
290290
; CHECK-NEXT: vshl.i16 d16, d16, #8
291-
; CHECK-NEXT: vshl.i16 d18, d18, #8
292-
; CHECK-NEXT: vshr.s16 d17, d18, #8
291+
; CHECK-NEXT: vshl.i16 d17, d17, #8
292+
; CHECK-NEXT: vshr.s16 d17, d17, #8
293293
; CHECK-NEXT: vsra.s16 d17, d16, #8
294294
; CHECK-NEXT: vstr d17, [r1]
295295
; CHECK-NEXT: mov pc, lr
@@ -347,11 +347,11 @@ define void @addCombineToVPADDLq_u8_early_zext(<16 x i8> *%cbcr, <8 x i16> *%X)
347347
define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
348348
; CHECK-LABEL: addCombineToVPADDL_u8:
349349
; CHECK: @ %bb.0:
350-
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
351-
; CHECK-NEXT: vext.8 d18, d16, d16, #1
350+
; CHECK-NEXT: vldr d16, [r0]
351+
; CHECK-NEXT: vext.8 d17, d16, d16, #1
352352
; CHECK-NEXT: vbic.i16 d16, #0xff00
353-
; CHECK-NEXT: vbic.i16 d18, #0xff00
354-
; CHECK-NEXT: vadd.i16 d16, d18, d16
353+
; CHECK-NEXT: vbic.i16 d17, #0xff00
354+
; CHECK-NEXT: vadd.i16 d16, d17, d16
355355
; CHECK-NEXT: vstr d16, [r1]
356356
; CHECK-NEXT: mov pc, lr
357357
%tmp = load <16 x i8>, <16 x i8>* %cbcr
@@ -368,7 +368,7 @@ define void @addCombineToVPADDL_u8(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp
368368
define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp {
369369
; CHECK-LABEL: addCombineToVPADDL_u8_early_zext:
370370
; CHECK: @ %bb.0:
371-
; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
371+
; CHECK-NEXT: vldr d16, [r0]
372372
; CHECK-NEXT: vmovl.u8 q8, d16
373373
; CHECK-NEXT: vpadd.i16 d16, d16, d17
374374
; CHECK-NEXT: vstr d16, [r1]

0 commit comments

Comments
 (0)